Source code for renku.core.models.entities

# -*- coding: utf-8 -*-
#
# Copyright 2018-2021- Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Represent provenance entities."""

import os
import pathlib
import weakref
from urllib.parse import quote, urljoin

import attr

from renku.core.management.command_builder.command import inject
from renku.core.models.calamus import JsonLDSchema, Nested, fields, prov, rdfs, renku, schema, wfprov
from renku.core.models.projects import Project, ProjectSchema


def _str_or_none(data):
    """Return str representation or None."""
    return str(data) if data is not None else data


@attr.s(eq=False, order=False)
class CommitMixin:
    """Represent a commit mixin."""

    commit = attr.ib(default=None, kw_only=True)
    client = attr.ib(default=None, kw_only=True)
    path = attr.ib(default=None, kw_only=True, converter=_str_or_none)

    _id = attr.ib(default=None, kw_only=True)
    _label = attr.ib(kw_only=True)
    _project = attr.ib(type=Project, kw_only=True, default=None)

    @property
    def submodules(self):
        """Proxy to client submodules."""
        if self.client:
            return self.client.submodules

    def default_id(self):
        """Configure calculated ID."""
        hexsha = self.commit.hexsha if self.commit else "UNCOMMITTED"
        return generate_file_id(client=self.client, hexsha=hexsha, path=self.path)

    @_label.default
    def default_label(self):
        """Generate a default label."""
        if self.commit:
            hexsha = self.commit.hexsha
        else:
            hexsha = "UNCOMMITTED"
        if self.path:
            path = self.path
            if self.client and os.path.isabs(path):
                path = pathlib.Path(path).relative_to(self.client.path)
            return generate_label(path, hexsha)
        return hexsha

    def __attrs_post_init__(self):
        """Post-init hook."""
        if self.path and self.client:
            path = pathlib.Path(self.path)
            if path.is_absolute():
                self.path = str(path.relative_to(self.client.path))

        # always force "project" to be the current project
        if self.client:
            self._project = self.client.project

        if not self._id:
            self._id = self.default_id()


[docs]@attr.s(eq=False, order=False) class Entity(CommitMixin): """Represent a data value or item.""" _parent = attr.ib( default=None, kw_only=True, converter=lambda value: weakref.ref(value) if value is not None else None ) checksum = attr.ib(default=None, kw_only=True, type=str)
[docs] @classmethod @inject.params(client="LocalClient") def from_revision(cls, client, path, revision="HEAD", parent=None, find_previous=True, **kwargs): """Return dependency from given path and revision.""" if find_previous: revision = client.find_previous_commit(path, revision=revision) client, commit, path = client.resolve_in_submodules(revision, path) path_ = client.path / path if path != "." and path_.is_dir(): entity = Collection(client=client, commit=commit, path=path, members=[], parent=parent) files_in_commit = commit.stats.files # update members with commits for member in path_.iterdir(): if member.name == ".gitkeep": continue member_path = str(member.relative_to(client.path)) find_previous = True if member_path in files_in_commit: # we already know the newest commit, no need to look it up find_previous = False try: assert all(member_path != m.path for m in entity.members) entity.members.append( cls.from_revision( client, member_path, commit, parent=entity, find_previous=find_previous, **kwargs ) ) except KeyError: pass else: entity = cls(client=client, commit=commit, path=str(path), parent=parent, **kwargs) return entity
@property def parent(self): # pragma: no cover """Return the parent object.""" return self._parent() if self._parent is not None else None @property def entities(self): """Yield itself.""" if self.client and not self.commit and self._label and "@UNCOMMITTED" not in self._label: self.commit = self.client.repo.commit(self._label.rsplit("@", maxsplit=1)[-1]) yield self
[docs] def set_client(self, client): """Sets the clients on this entity.""" self.client = client
[docs]@attr.s(eq=False, order=False) class Collection(Entity): """Represent a directory with files.""" members = attr.ib(kw_only=True, default=None)
[docs] def default_members(self): """Generate default members as entities from current path.""" if not self.client: return [] dir_path = self.client.path / self.path if not dir_path.exists(): # likely a directory deleted in a previous commit return [] assert dir_path.is_dir() members = [] for path in dir_path.iterdir(): if path.name == ".gitkeep": continue # ignore empty directories in Git repository cls = Collection if path.is_dir() else Entity members.append( cls(commit=self.commit, client=self.client, path=str(path.relative_to(self.client.path)), parent=self) ) return members
@property def entities(self): """Recursively return all files.""" for member in self.members: if not member.client and self.client: member.client = self.client yield from member.entities if self.client and not self.commit and self._label and "@UNCOMMITTED" not in self._label: self.commit = self.client.repo.commit(self._label.rsplit("@", maxsplit=1)[-1]) yield self
[docs] def set_client(self, client): """Sets the clients on this entity.""" super().set_client(client) for m in self.members: m.set_client(client)
def __attrs_post_init__(self): """Init members.""" super().__attrs_post_init__() if self.members is None: self.members = self.default_members() for member in self.members: member._parent = weakref.ref(self)
class CommitMixinSchema(JsonLDSchema): """CommitMixin schema.""" class Meta: """Meta class.""" model = CommitMixin path = fields.String(prov.atLocation) _id = fields.Id(init_name="id") _label = fields.String(rdfs.label, init_name="label", missing=None) _project = Nested(schema.isPartOf, ProjectSchema, init_name="project", missing=None) class EntitySchema(CommitMixinSchema): """Entity Schema.""" class Meta: """Meta class.""" rdf_type = [prov.Entity, wfprov.Artifact] model = Entity checksum = fields.String(renku.checksum, missing=None) class CollectionSchema(EntitySchema): """Entity Schema.""" class Meta: """Meta class.""" rdf_type = [prov.Collection] model = Collection members = Nested(prov.hadMember, [EntitySchema, "CollectionSchema"], many=True) def generate_label(path, hexsha): """Generate label field.""" return f"{path}@{hexsha}" def generate_file_id(client, hexsha, path): """Generate DatasetFile id field.""" # Determine the hostname for the resource URIs. # If RENKU_DOMAIN is set, it overrides the host from remote. # Default is localhost. host = "localhost" if client: host = client.remote.get("host") or host host = os.environ.get("RENKU_DOMAIN") or host # TODO: Use plural name for entity id: /blob/ -> /blobs/ # always set the id by the identifier return urljoin(f"https://{host}", pathlib.posixpath.join(f"/blob/{hexsha}/{quote(str(path))}"))