58 changes: 40 additions & 18 deletions isimip_data/metadata/filters.py
@@ -1,7 +1,7 @@
 import logging
 
 from django.contrib.postgres.search import SearchQuery
-from django.core.exceptions import FieldError, ValidationError
+from django.core.exceptions import ValidationError
 from django.db.models import Q
 
 from rest_framework.filters import BaseFilterBackend
@@ -35,7 +35,7 @@ def filter_queryset(self, request, queryset, view):
             return queryset
 
         dataset_ids = request.GET.getlist('dataset')
-        print(request.GET)
+
         if dataset_ids:
             queryset = queryset.filter(dataset_id__in=dataset_ids)

@@ -65,7 +65,14 @@ def filter_queryset(self, request, queryset, view):
         if path_list:
             q = Q()
             for path in path_list:
-                q |= Q(path__startswith=path) | Q(links__path__startswith=path)
+                filter_kwargs = {'path__startswith': path}
+
+                if getattr(view, 'filter_resolve_links', True):
+                    subquery = queryset.model.objects.filter(**filter_kwargs).values('root_id').order_by()
+                    q |= Q(root_id__in=subquery)
+                else:
+                    q |= Q(**filter_kwargs)
+
             queryset = queryset.filter(q)
 
         return queryset
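This hunk sets up the pattern the rest of the PR repeats: instead of OR-ing an extra `links__path__startswith` condition onto each lookup, the filter collects the `root_id`s of the directly matching rows in a subquery and then matches every row that shares one of those roots. The trailing `.order_by()` clears any default ordering so no `ORDER BY` leaks into the `IN (...)` subquery. A minimal sketch of the idea, assuming `root_id` (added to the models below) is shared by a root dataset and everything linked to it; the helper name and the path value are illustrative, not part of the PR:

```python
from django.db.models import Q

def resolve_links_q(model, **filter_kwargs):
    # roots of the rows that match the plain filter ...
    subquery = model.objects.filter(**filter_kwargs).values('root_id').order_by()
    # ... then every row sharing one of those roots, i.e. the direct
    # matches plus all rows linked to the same root
    return Q(root_id__in=subquery)

# e.g. Dataset.objects.filter(resolve_links_q(Dataset, path__startswith='ISIMIP3b/'))
```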
@@ -94,9 +101,15 @@ def filter_queryset(self, request, queryset, view):

         # last, perform a full text search on the search_vector field
         if queryset.model == File:
-            queryset = queryset.filter(dataset__search__vector=search_query)
+            filter_kwargs = {'dataset__search__vector': search_query}
         else:
-            queryset = queryset.filter(search__vector=search_query)
+            filter_kwargs = {'search__vector': search_query}
+
+        if getattr(view, 'filter_resolve_links', True):
+            subquery = queryset.model.objects.filter(**filter_kwargs).values('root_id').order_by()
+            queryset = queryset.filter(root_id__in=subquery)
+        else:
+            queryset = queryset.filter(**filter_kwargs)
 
         return queryset
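For `File` querysets the full-text condition spans the relation to the dataset's search record; for `Dataset` it hits the search vector directly. The construction of `search_query` happens earlier in the method, outside this excerpt; presumably something along these lines:

```python
from django.contrib.postgres.search import SearchQuery

# illustrative only; the real construction is not shown in this hunk
search_query = SearchQuery('gfdl-esm4 historical')
```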

@@ -108,12 +121,10 @@ def filter_queryset(self, request, queryset, view):
             return queryset
 
         if request.GET.get('all') != 'true':
-            try:
-                # datasets have a public field
-                queryset = queryset.filter(public=True)
-            except FieldError:
-                # files need to check the public field of the corresponding dataset
+            if queryset.model == File:
                 queryset = queryset.filter(dataset__public=True)
+            else:
+                queryset = queryset.filter(public=True)
 
         after = request.GET.get('after')
         if after:
@@ -132,16 +143,19 @@ def filter_queryset(self, request, queryset, view):
         if view.detail:
             return queryset
 
-        # see https://docs.djangoproject.com/en/2.2/ref/contrib/postgres/fields/#std:fieldlookup-hstorefield.contains
-        # and https://docs.djangoproject.com/en/2.2/ref/contrib/postgres/fields/#containment-and-key-operations
-        # for optimal jsonb lookups: queryset.filter(field={'foo': 'bar', 'egg': 'spam'})
         for identifier in Identifier.objects.using('metadata').identifiers():
-            if identifier != getattr(view, 'identifier_filter_exclude', None):
+            if identifier != getattr(view, 'filter_exclude_identifier', None):
                 q = Q()
                 for value in request.GET.getlist(identifier):
                     if value:
-                        q |= Q(specifiers__contains={identifier: value})
-                        q |= Q(links__specifiers__contains={identifier: value})
+                        filter_kwargs = {'specifiers__contains': {identifier: value}}
+
+                        if getattr(view, 'filter_resolve_links', True):
+                            subquery = queryset.model.objects.filter(**filter_kwargs).values('root_id').order_by()
+                            q |= Q(root_id__in=subquery)
+                        else:
+                            q |= Q(**filter_kwargs)
+
                 queryset = queryset.filter(q)
 
         return queryset
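The removed doc links explained why `specifiers__contains` is used here: on PostgreSQL it compiles to the JSONB containment operator `@>`, which a GIN index on the `specifiers` column can serve. Roughly, with an illustrative identifier and value:

```python
from django.db.models import Q

# compiles to JSONB containment on PostgreSQL, approximately:
#   ... WHERE "datasets"."specifiers" @> '{"climate_forcing": "gfdl-esm4"}'
q = Q(specifiers__contains={'climate_forcing': 'gfdl-esm4'})
```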
@@ -157,10 +171,18 @@ def filter_queryset(self, request, queryset, view):
         if tree_list:
             q = Q()
             for tree in tree_list:
+                tree = tree.rstrip('/') + '/'
+
                 if queryset.model == File:
-                    q |= Q(dataset__tree_path__startswith=tree) | Q(dataset__links__tree_path__startswith=tree)
+                    filter_kwargs = {'dataset__tree_path__startswith': tree}
                 else:
-                    q |= Q(tree_path__startswith=tree) | Q(links__tree_path__startswith=tree)
+                    filter_kwargs = {'tree_path__startswith': tree}
+
+                if getattr(view, 'filter_resolve_links', True):
+                    subquery = queryset.model.objects.filter(**filter_kwargs).values('root_id').order_by()
+                    q |= Q(root_id__in=subquery)
+                else:
+                    q |= Q(**filter_kwargs)
 
             queryset = queryset.filter(q)
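The added `rstrip`/concatenation closes a partial-match hole: a bare prefix would also match sibling trees that merely share leading characters. With illustrative values:

```python
tree = 'ISIMIP3a/OutputData'.rstrip('/') + '/'
# 'ISIMIP3a/OutputData/' now matches 'ISIMIP3a/OutputData/...' only,
# no longer a sibling like 'ISIMIP3a/OutputData2/...'
```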

9 changes: 7 additions & 2 deletions isimip_data/metadata/managers.py
@@ -1,12 +1,17 @@
 from django.contrib.postgres.aggregates import ArrayAgg
 from django.db import models
+from django.db.models.fields.json import KeyTextTransform
 
 
 class DatasetQuerySet(models.QuerySet):
 
     def histogram(self, identifier):
-        field = f'specifiers__{identifier}'
-        return self.values_list(field).annotate(count=models.Count(field)).order_by(field)
+        return (
+            self.annotate(specifier=KeyTextTransform(identifier, 'specifiers'))
+            .values_list('specifier')
+            .annotate(count=models.Count('root_id', distinct=True))
+            .order_by('specifier')
+        )
 
 
 class DatasetManager(models.Manager):
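The rewritten `histogram()` pulls the identifier's value out of the JSONB `specifiers` column with `KeyTextTransform`, groups by it, and counts distinct `root_id`s, so linked copies of the same dataset no longer inflate the counts. A usage sketch, assuming the manager is wired to `DatasetQuerySet` and using an illustrative identifier:

```python
from isimip_data.metadata.models import Dataset

# yields (value, count) tuples, one per distinct specifier value
for value, count in Dataset.objects.using('metadata').histogram('climate_forcing'):
    print(value, count)
```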
47 changes: 22 additions & 25 deletions isimip_data/metadata/middleware.py
@@ -1,12 +1,10 @@
 import re
-from datetime import UTC
 
 from django.db.models import Max
 from django.middleware.cache import CacheMiddleware
 from django.utils.cache import learn_cache_key
-from django.utils.timezone import make_aware
 
-from .models import Dataset, Resource
+from .models import Dataset, File, Resource
 
 
 class MetadataCacheMiddleware(CacheMiddleware):
@@ -23,19 +21,19 @@ class MetadataCacheMiddleware(CacheMiddleware):
     )
 
     def process_request(self, request):
-        self.update = self.check_path_info(request.path_info, self.update_patterns)
-        self.path = self.check_path_info(request.path_info, self.path_patterns)
+        request._in_update_patterns = self._check_path_info(request.path_info, self.update_patterns)
+        request._in_path_patterns = self._check_path_info(request.path_info, self.path_patterns)
 
-        if self.update or self.path:
-            self.update_cache()
+        if request._in_update_patterns or request._in_path_patterns:
+            self._check_cache_invalidation()
 
-        if self.path:
+        if request._in_path_patterns:
             return super().process_request(request)
         else:
             return None
 
     def process_response(self, request, response):
-        if self.path:
+        if getattr(request, '_in_path_patterns', False):
             # this is a limited version of process_response in UpdateCacheMiddleware
             # which does not set the headers to let the client cache the response as well
             if not self._should_update_cache(request, response):
@@ -48,31 +46,30 @@ def process_response(self, request, response):

         return response
 
-    def check_path_info(self, path_info, patterns):
+    def _check_path_info(self, path_info, patterns):
         return any(pattern.search(path_info) for pattern in patterns)
 
-    def update_cache(self):
+    def _check_cache_invalidation(self):
+        # skip check if we checked recently
+        if self.cache.get('invalidation_checked'):
+            return
+
         # get the cache_timestamp from the cache
         cache_timestamp = self.cache.get('timestamp')
 
         # get the latest timestamp from the datasets and resources table
-        timestamp_values = [
-            make_aware(value, UTC) for value in Dataset.objects.using('metadata').aggregate(
-                Max('created'),
-                Max('updated'),
-                Max('published'),
-                Max('archived')
-            ).values() if value is not None
-        ] + [
-            make_aware(value, UTC) for value in Resource.objects.using('metadata').aggregate(
-                Max('created'),
-                Max('updated')
-            ).values() if value is not None
+        timestamps = [
+            Dataset.objects.using('metadata').aggregate(latest=Max('last_changed'))['latest'],
+            File.objects.using('metadata').aggregate(latest=Max('last_changed'))['latest'],
+            Resource.objects.using('metadata').aggregate(latest=Max('last_changed'))['latest'],
         ]
-        timestamp = max(timestamp_values) if timestamp_values else None
+        timestamp = max(t for t in timestamps if t is not None)
 
         # check if the timestamp is later than cache_timestamp
-        if cache_timestamp is None or timestamp is None or timestamp > cache_timestamp:
+        if cache_timestamp is None or timestamp > cache_timestamp:
            # the datasets table has changed, clear the cache and set a new timestamp
             self.cache.clear()
             self.cache.set('timestamp', timestamp, self.cache_timeout)
+
+        # mark that we just checked, regardless of whether we invalidated
+        self.cache.set('invalidation_checked', True, 30)
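Two changes here. The per-request flags move from `self` to the `request` object, presumably because a middleware instance is created once and shared across concurrent requests, so instance attributes like `self.path` can race between threads while attributes on the request stay request-local. And the expensive part of the check, three `MAX(last_changed)` queries, is now throttled by the short-lived `invalidation_checked` key. A sketch of the request-local flag pattern, with hypothetical names:

```python
class ExampleMiddleware:
    # hypothetical middleware illustrating the request-local flag pattern
    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        # stored on the request, so concurrent requests cannot race on shared state
        request._in_path_patterns = request.path_info.startswith('/api/')
        response = self.get_response(request)
        # read back with a default, in case the attribute was never set
        if getattr(request, '_in_path_patterns', False):
            pass  # e.g. store the response in the cache
        return response
```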
7 changes: 7 additions & 0 deletions isimip_data/metadata/models.py
@@ -37,6 +37,9 @@ class Dataset(models.Model):
     updated = models.DateTimeField()
     published = models.DateTimeField()
     archived = models.DateTimeField()
+    last_changed = models.DateTimeField()
+
+    root_id = models.UUIDField(editable=False)
 
     class Meta:
         db_table = 'datasets'
@@ -130,6 +133,9 @@ class File(models.Model):

     created = models.DateTimeField()
     updated = models.DateTimeField()
+    last_changed = models.DateTimeField()
+
+    root_id = models.UUIDField(editable=False)
 
     class Meta:
         db_table = 'files'
@@ -231,6 +237,7 @@ class Resource(models.Model):

     created = models.DateTimeField()
     updated = models.DateTimeField()
+    last_changed = models.DateTimeField()
 
     datasets = models.ManyToManyField(Dataset, related_name='resources')
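These models map onto tables of the external `metadata` database via `db_table`, so the new `last_changed` and `root_id` columns are presumably provided by that database rather than by a Django migration. With `root_id` in place, "this dataset plus everything linked to it" collapses into a single lookup; a sketch with a hypothetical helper:

```python
from isimip_data.metadata.models import Dataset

def related_datasets(dataset):
    # hypothetical helper: the dataset itself plus everything linked to its root
    return Dataset.objects.using('metadata').filter(root_id=dataset.root_id)
```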

45 changes: 21 additions & 24 deletions isimip_data/metadata/serializers.py
@@ -205,34 +205,31 @@ def get_filelist_url(self, obj):
         return reverse('dataset-detail-filelist', args=[obj.id], request=self.context.get('request'))
 
     def get_caveats(self, obj):
-        if self.context.get('request').GET.get('caveats'):
-            user = self.context['request'].user
-            queryset = Caveat.objects.exclude(public=False) \
-                .filter(datasets__contains=[obj.id]).public(user)
-            serializer = DatasetCaveatSerializer(queryset, many=True)
-            return serializer.data
-        else:
-            return []
+        caveats = [
+            caveat
+            for caveat in self.context.get('caveats', [])
+            if obj.id in caveat.datasets
+        ]
+        serializer = DatasetCaveatSerializer(caveats, many=True)
+        return serializer.data
 
     def get_caveats_versions(self, obj):
-        if self.context.get('request').GET.get('caveats'):
-            user = self.context['request'].user
-            versions = Dataset.objects.using('metadata').filter(path=obj.path).exclude(id=obj.id)
-            queryset = Caveat.objects.exclude(public=False) \
-                .exclude(datasets__contains=[obj.id]) \
-                .filter(datasets__overlap=[version.id for version in versions]).public(user)
-            serializer = DatasetCaveatSerializer(queryset, many=True)
-            return serializer.data
-        else:
-            return []
+        caveats = [
+            caveat
+            for caveat in self.context.get('caveats_versions', [])
+            if self.context.get('versions', {}).get(obj.path).intersection(caveat.datasets)
+        ]
+        serializer = DatasetCaveatSerializer(caveats, many=True)
+        return serializer.data
 
     def get_annotations(self, obj):
-        if self.context.get('request').GET.get('annotations'):
-            queryset = Annotation.objects.filter(datasets__contains=[obj.id])
-            serializer = DatasetAnnotationSerializer(queryset, many=True)
-            return serializer.data
-        else:
-            return []
+        annotations = [
+            annotation
+            for annotation in self.context.get('annotations', [])
+            if obj.id in annotation.datasets
+        ]
+        serializer = DatasetAnnotationSerializer(annotations, many=True)
+        return serializer.data
 
 
 class FileLinkSerializer(serializers.ModelSerializer):
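The serializer methods no longer issue queries per dataset; they filter lists that the view is now expected to place into the serializer context (`caveats`, `caveats_versions`, `annotations`, `versions`), trading an N+1 query pattern for a handful of per-request queries. A hedged sketch of the view side; `get_serializer_context` is standard DRF, but the prefetch logic shown is illustrative and not taken from this diff:

```python
def get_serializer_context(self):
    context = super().get_serializer_context()
    if self.request.GET.get('caveats'):
        context['caveats'] = list(Caveat.objects.public(self.request.user))
    if self.request.GET.get('annotations'):
        context['annotations'] = list(Annotation.objects.all())
    return context
```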
46 changes: 9 additions & 37 deletions isimip_data/metadata/sitemaps.py
@@ -1,64 +1,36 @@
 from django.contrib.sitemaps import Sitemap
-from django.db.models.functions import Greatest
 
 from .models import Dataset, File, Resource
 
 
 class DatasetSitemap(Sitemap):
     changefreq = 'never'
-    limit = 50_000
+    limit = 10_000
 
     def items(self):
-        return (
-            Dataset.objects
-            .using('metadata')
-            .order_by('id')
-            .annotate(last_changed=Greatest('created', 'updated', 'published', 'archived'))
-            .values('id', 'last_changed')
-        )
+        return Dataset.objects.using('metadata').order_by('id').only('id', 'last_changed')
 
     def lastmod(self, obj):
-        return obj['last_changed']
-
-    def location(self, obj):
-        return f'/datasets/{obj["id"]}/'
+        return obj.last_changed
 
 
 class FileSitemap(Sitemap):
     changefreq = 'never'
-    limit = 50_000
+    limit = 10_000
 
     def items(self):
-        return (
-            File.objects
-            .using('metadata')
-            .order_by('id')
-            .annotate(last_changed=Greatest('created', 'updated'))
-            .values('id', 'last_changed')
-        )
+        return File.objects.using('metadata').order_by('id').only('id', 'last_changed')
 
     def lastmod(self, obj):
-        return obj['last_changed']
-
-    def location(self, obj):
-        return f'/files/{obj["id"]}/'
+        return obj.last_changed
 
 
 class ResourceSitemap(Sitemap):
     changefreq = 'never'
-    limit = 50_000
+    limit = 10_000
 
     def items(self):
-        return (
-            Resource.objects
-            .using('metadata')
-            .order_by('id')
-            .annotate(last_changed=Greatest('created', 'updated'))
-            .values('id', 'doi', 'last_changed')
-        )
+        return Resource.objects.using('metadata').order_by('id').only('id', 'last_changed')
 
     def lastmod(self, obj):
-        return obj['last_changed']
-
-    def location(self, obj):
-        return f'/{obj["doi"]}/'
+        return obj.last_changed
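`lastmod` now reads the precomputed `last_changed` column instead of computing `Greatest(...)` per row, and since `items()` returns model instances again, the `location()` overrides can go away in favour of the `get_absolute_url()` fallback built into `Sitemap.location`. For reference, a sketch of the usual wiring in `urls.py`; standard `django.contrib.sitemaps` usage, not shown in this PR:

```python
from django.contrib.sitemaps.views import sitemap
from django.urls import path

from isimip_data.metadata.sitemaps import DatasetSitemap, FileSitemap, ResourceSitemap

sitemaps = {
    'datasets': DatasetSitemap,
    'files': FileSitemap,
    'resources': ResourceSitemap,
}

urlpatterns = [
    path('sitemap.xml', sitemap, {'sitemaps': sitemaps}, name='sitemap'),
]
```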