Source code for catalog.models

import datetime
import hashlib
import json
from pathlib import Path
import uuid
import zipfile

from django.conf import settings
from django.core.exceptions import ValidationError
from django.core.files.base import File
from django.core.serializers.json import DjangoJSONEncoder
from django.core.validators import MinLengthValidator
from django.db import models
from django.utils.module_loading import import_string
from django.utils.translation import gettext_lazy as _

from biospecdb import __version__
from explorer.models import Query
from uploader.base_models import DatedModel


class CustomDjangoJsonEncoder(DjangoJSONEncoder):
    """ ``DjangoJSONEncoder`` variant that separates encoded items with a comma followed by a newline.

        Used as the ``encoder`` of ``Dataset.spectral_data_filenames`` in this module.
    """
    # Overrides the encoder's class-level item separator so each item is emitted on its own line.
    item_separator = ',\n'


def empty_list():
    """ Return a brand-new empty list on every call.

        Serves as a callable field default (see ``Dataset.spectral_data_filenames``) so that no single
        list object is shared between instances.
    """
    return list()


def get_app_version(*args, **kwargs):
    """ Return the imported package ``__version__`` as a string.

        Any positional or keyword arguments are accepted and ignored.
    """
    version = __version__
    return str(version)


class Dataset(DatedModel):
    """ Pre-canned dataset model.

        Attributes:
            id (:obj:`django.db.models.UUIDField`): The primary key for the dataset as stored in the website's main database.
            query (:obj:`django.db.models.ForeignKey` of :obj:`explorer.models.Query`): The query used to produce a dataset is itself modeled and stored in the database - this field is the query's name.
            sql (:obj:`django.db.models.TextField`): This is the SQL query used to generate the dataset from the database.
            version (:obj:`django.db.models.CharField`): Datasets are versioned to distinguish between those generated by the same query at different times in the lifetime of the database. I.e., different snapshots as the database contents grows.
            name (:obj:`django.db.models.CharField`): This is the name given to the dataset.
            description (:obj:`django.db.models.TextField`): A more verbose text describing the semantics and contents of this particular dataset.
            file (:obj:`django.db.models.FileField`): This is the link to the zip file. Click to download.
            app_version (:obj:`django.db.models.CharField`): The version of the application deployed and thus used to generate the dataset.
            sha256 (:obj:`django.db.models.CharField`): The SHA-256 checksum of the entire zip file.
            n_rows (:obj:`django.db.models.IntegerField`): The number of rows in the zipped data file. Depending on the query, this could be the total number of patients or something else.
            data_sha256 (:obj:`django.db.models.CharField`): The SHA-256 checksum of the data file archived within the zip file.
            spectral_data_filenames (:obj:`django.db.models.JSONField`): A list of all the file names for all individual spectral data files zipped within the downloadable zip file.
    """

    class Meta:
        verbose_name = "BSR dataset"
        get_latest_by = "updated_at"
        unique_together = [["name", "version"]]

    UPLOAD_DIR = "datasets/"  # MEDIA_ROOT/datasets

    # Transient (non-DB) cache: ``clean()`` stores the generated archive and its filename here and
    # ``save()`` consumes them when attaching the archive to the ``file`` field.
    _file = None
    _filename = None

    id = models.UUIDField(unique=True, primary_key=True, default=uuid.uuid4, verbose_name="ID")
    query = models.ForeignKey(Query, on_delete=models.PROTECT, related_name="dataset")
    sql = models.TextField(blank=True, null=False, editable=False, verbose_name="SQL")
    version = models.CharField(max_length=32, null=False, blank=False, help_text="Version String, i.e., YYYY.N")
    name = models.CharField(max_length=32,
                            null=False,
                            blank=True,
                            help_text="If not provide the query title will be used")
    description = models.TextField(max_length=256,
                                   null=False,
                                   blank=True,
                                   help_text="If not provide the query description will be used")
    file = models.FileField(upload_to=UPLOAD_DIR,
                            editable=False,
                            null=False,
                            blank=True,
                            max_length=256)
    app_version = models.CharField(max_length=32,
                                   default=get_app_version,
                                   editable=False,
                                   null=False,
                                   blank=True,
                                   help_text="App version used to create data product")
    sha256 = models.CharField(max_length=64,
                              editable=False,
                              null=False,
                              blank=True,
                              verbose_name="SHA-256",
                              help_text="Checksum of downloadable file",
                              validators=[MinLengthValidator(64)])
    n_rows = models.IntegerField(null=False,
                                 blank=True,
                                 editable=False,
                                 help_text="Number of data rows")
    data_sha256 = models.CharField(max_length=64,
                                   editable=False,
                                   null=False,
                                   blank=True,
                                   verbose_name="Data SHA-256",
                                   help_text="Checksum of data table (not including any spectral data files).",
                                   validators=[MinLengthValidator(64)])
    spectral_data_filenames = models.JSONField(null=False,
                                               default=empty_list,
                                               blank=True,
                                               editable=False,
                                               help_text="List of spectral data filenames",
                                               encoder=CustomDjangoJsonEncoder)

    def __str__(self):
        """ Return ``"<name>_v<version>"``. """
        return f"{self.name}_v{self.version}"

    def get_filename(self):
        """ Return the extension-less filename for this dataset as a :obj:`pathlib.Path`.

            The name is ``str(self)`` with every ``-`` and ``.`` replaced by ``_``.
        """
        return Path(str(self).replace('-', '_').replace('.', '_'))

    def clean(self, *args, **kwargs):
        """ Model validation.

            Fills ``name`` and ``description`` from the query when they are empty, copies the query's SQL
            into ``sql`` and, when no file is attached yet, executes the query and caches the resulting
            archive (plus row count, data checksum and spectral data filenames) for ``save()``.

            Raises:
                ValidationError: If the query produced no rows.
        """
        self.name = self.name or self.query.title
        self.description = self.description or self.query.description
        self.sql = self.query.sql

        if not self.file:
            # Create file from query.
            file, info = self.execute_query()
            filename, n_rows, data_sha256, spectral_data_filenames = info

            if not n_rows:
                raise ValidationError(_("Query returned no data."))

            self._file = file
            self._filename = filename
            self.n_rows = n_rows
            self.data_sha256 = data_sha256
            self.spectral_data_filenames = spectral_data_filenames

        super().clean(*args, **kwargs)

    def get_exporter(self):
        """ Get exporter handler for generating downloadable file content.

            The class is imported from the dotted path in ``settings.DATASET_CATALOG_FILE_CLASS``.
        """
        return import_string(settings.DATASET_CATALOG_FILE_CLASS)

    def execute_query(self):
        """ Execute the SQL query against the database to produce the resultant data that is the cataloged dataset.

            Returns:
                tuple: ``(output, info)`` where ``output`` is whatever the exporter's ``get_file_output()``
                produced and ``info`` is ``(filename, *exporter_info)``. ``filename`` is ``get_filename()``
                carrying the suffix of the exporter's own filename.
        """
        exporter = self.get_exporter()(self.query)
        output, info = exporter.get_file_output(always_zip=True,
                                                include_data_files=True,
                                                return_info=True)

        ext = Path(exporter.get_filename()).suffix
        filename = self.get_filename().with_suffix(ext)
        return output, (filename, *info)

    def compute_checksum(self):
        """ Compute Checksum of zipped dataset file.

            Returns:
                str: Hex-encoded SHA-256 digest of ``self.file``, or ``''`` when no file is attached.
        """
        if not self.file:
            return ''

        def _hash(fp):
            # Hash chunk by chunk rather than reading the whole file at once.
            algorithm = hashlib.sha256()
            for chunk in fp.chunks():
                algorithm.update(chunk)
            return algorithm.hexdigest()

        if self.file.closed:
            with self.file.open() as fp:
                checksum = _hash(fp)
        else:
            # If already open, leave open, however, call open again to seek(0).
            self.file.open()
            checksum = _hash(self.file)
        return checksum

    def meta_info(self, **kwargs):
        """ Generate Meta Info

            Args:
                **kwargs: Extra entries merged into the result; they override the generated entries on key clash.

            Returns:
                dict: Metadata describing this dataset (this is what ``save()`` writes to ``INFO.json``).
        """
        info = dict(name=self.name,
                    version=self.version,
                    description=self.description,
                    sql=self.sql,
                    data_sha256=self.data_sha256,
                    app_version=self.app_version,
                    id=str(self.id),
                    n_rows=self.n_rows,
                    n_spectral_data_files=len(self.spectral_data_filenames),
                    # NOTE: ``datetime.datetime.now()`` without a tz yields a naive, local-time timestamp.
                    timestamp=str(datetime.datetime.now()),
                    spectral_data_filenames=self.spectral_data_filenames)
        info.update(kwargs)
        return info

    def save(self, *args, **kwargs):
        """ Attach any archive cached by ``clean()``, refresh the checksum and save the model.

            When an archive is cached, ``INFO.json`` (see ``meta_info()``) is appended to it before it is
            assigned to ``file``. The cache is then cleared so that saving the same instance again neither
            appends a second ``INFO.json`` nor stores the archive a second time.
        """
        if self._file:
            # Append dataset meta data as INFO.json.
            with zipfile.ZipFile(self._file,
                                 mode='a',
                                 compression=import_string(settings.ZIP_COMPRESSION),
                                 compresslevel=settings.ZIP_COMPRESSION_LEVEL) as archive:
                archive.writestr("INFO.json", json.dumps(self.meta_info(),
                                                         indent=1,
                                                         cls=DjangoJSONEncoder))
            self.file = File(self._file, name=self._filename)

            # The archive now lives on ``self.file``; drop the cache so a repeated (or retried) save()
            # does not process it again.
            self._file = None
            self._filename = None

        # Create checksum.
        self.sha256 = self.compute_checksum()

        # Save file (and everything else).
        super().save(*args, **kwargs)

    def asave(self, *args, **kwargs):
        """ Async saving is not supported. """
        raise NotImplementedError

    def delete(self, *args, delete_files=True, **kwargs):
        """ Delete the database row and, by default, the stored archive.

            Args:
                delete_files (bool): When true (the default), also remove the stored file from storage.

            Returns:
                tuple: ``(count, deleted)`` exactly as returned by the parent ``delete()``.
        """
        count, deleted = super().delete(*args, **kwargs)
        # Only touch storage when there is actually a file name to delete; storages reject empty names.
        if count == 1 and delete_files and self.file:
            self.file.storage.delete(self.file.name)
        return count, deleted

    def adelete(self, *args, **kwargs):
        """ Async deletion is not supported. """
        raise NotImplementedError

    @classmethod
    def get_orphan_files(cls):
        """ Return list of orphaned files that exist within the file system but don't have entries in the database.

            Returns:
                tuple: ``(storage, orphaned_files)`` where ``orphaned_files`` is a set of file names (always a
                set, empty when the upload directory does not exist).
        """
        storage = cls.file.field.storage
        path = Path(cls.file.field.upload_to)
        # Collect all stored media files.
        try:
            _dirs, filenames = storage.listdir(path)
        except FileNotFoundError:
            return storage, set()
        fs_files = {str(path / x) for x in filenames}
        # Collect all media files referenced in the DB (only the file column is fetched).
        data_files = set(cls.objects.values_list("file", flat=True))
        # Compute orphaned file list.
        orphaned_files = fs_files - data_files
        return storage, orphaned_files