diff --git a/contrib/postactivate b/contrib/postactivate
index 9482463..b3634e7 100755
--- a/contrib/postactivate
+++ b/contrib/postactivate
@@ -29,6 +29,10 @@ function manage_with_settings() {
     PYTHONPATH="$PYPLN_ROOT:$PYTHONPATH" python "$PYPLN_ROOT"/manage.py $* --settings=pypln.web.settings.$SETTINGS;
 }
 
+function manage() {
+    PYTHONPATH="$PYPLN_ROOT:$PYTHONPATH" python "$PYPLN_ROOT"/manage.py "$@"
+}
+
 alias manage_dev="manage_with_settings development"
 alias manage_test="manage_with_settings test"
 alias run_tests="manage_test test pypln.web.core.tests"
diff --git a/pypln/web/backend_adapter/pipelines.py b/pypln/web/backend_adapter/pipelines.py
index 4ffbf3f..f77505e 100644
--- a/pypln/web/backend_adapter/pipelines.py
+++ b/pypln/web/backend_adapter/pipelines.py
@@ -44,6 +44,10 @@ def create_indexing_pipeline(doc):
         {"index_name": doc.index_name, "doc_type": doc.doc_type}})
     (Extractor().si(doc_id) | ElasticIndexer().si(doc_id))()
 
+def calculate_corpus_freqdist(corpus):
+    blob_ids = map(ObjectId, corpus.document_set.values_list('blob', flat=True))
+    CorpusFreqDist().delay(corpus.pk, blob_ids)
+
 def get_config_from_router(api, timeout=5):
     client = Client()
     client.connect(api)
diff --git a/pypln/web/backend_adapter/tests.py b/pypln/web/backend_adapter/tests.py
index 28e18a8..936963e 100644
--- a/pypln/web/backend_adapter/tests.py
+++ b/pypln/web/backend_adapter/tests.py
@@ -26,13 +26,13 @@
 from mock import patch
 
 from pypln.web.backend_adapter.pipelines import (create_indexing_pipeline,
-        call_default_pipeline, create_pipeline_from_document)
-from pypln.web.core.models import IndexedDocument, Document, mongodb_storage
+        call_default_pipeline, create_pipeline_from_document, calculate_corpus_freqdist)
+from pypln.web.core.models import IndexedDocument, Document, mongodb_storage, Corpus
 from pypln.web.core.tests.utils import TestWithMongo
 
 
 __all__ = ["CreatePipelineTest", "CreateIndexingPipelineTest",
-    "CreatePipelineFromDocumentTest"]
+    "CreatePipelineFromDocumentTest", "CorpusFreqDistTest"]
 
 
 class CreatePipelineTest(TestWithMongo):
@@ -68,14 +68,14 @@ def test_should_create_indexing_pipelines_for_document(self, extractor):
         extractor.assert_called_with()
         extractor.return_value.si.assert_called_with(ObjectId(self.document.blob.name))
 
-    @patch('pypln.web.backend_adapter.pipelines.GridFSDataRetriever', autospec=True)
+    @patch('pypln.web.backend_adapter.pipelines.Extractor', autospec=True)
     def test_should_add_index_name_to_the_document_in_mongo(self, gridfs_data_retriever):
         create_indexing_pipeline(self.document)
 
         mongo_document = self.get_mongo_doc(self.document)
         self.assertEqual(mongo_document['index_name'], self.document.index_name)
 
-    @patch('pypln.web.backend_adapter.pipelines.GridFSDataRetriever', autospec=True)
+    @patch('pypln.web.backend_adapter.pipelines.Extractor', autospec=True)
     def test_should_add_doc_type_to_the_document_in_mongo(self, gridfs_data_retriever):
         create_indexing_pipeline(self.document)
 
@@ -91,3 +91,16 @@ def test_create_pipeline_from_document_instantiates_a_document_id(self, fake_cal
         doc = Document.objects.all()[0]
         create_pipeline_from_document(doc)
         fake_call_default_pipeline.assert_called_with(ObjectId(doc.blob.name))
+
+
+class CorpusFreqDistTest(TestWithMongo):
+    fixtures = ['users', 'corpora', 'documents']
+
+    @patch('pypln.web.backend_adapter.pipelines.CorpusFreqDist', autospec=True)
+    def test_should_call_CorpusFreqDist_with_document_ids(self,
+            corpus_freqdist_worker):
+        corpus = Corpus.objects.get(pk=2)
+        ids = [ObjectId("562526d9798ebd4616b23bb1")]
+        calculate_corpus_freqdist(corpus)
+        corpus_freqdist_worker.assert_called_with()
+        corpus_freqdist_worker.return_value.delay.assert_called_with(corpus.pk, ids)
diff --git a/pypln/web/core/fixtures/mongodb/corpora_analysis.json b/pypln/web/core/fixtures/mongodb/corpora_analysis.json
new file mode 100644
index 0000000..ff77cb0
--- /dev/null
+++ b/pypln/web/core/fixtures/mongodb/corpora_analysis.json
@@ -0,0 +1,16 @@
+[
+    {
+        "_id" : { "$oid": "5785005257bc3a1070d8cdbf" },
+        "corpus_id" : 2,
+        "freqdist" : [
+            [ "á", 1 ],
+            [ "non-ascii", 1 ],
+            [ ".", 1 ],
+            [ "char", 1 ],
+            [ "file", 1 ],
+            [ "test", 1 ],
+            [ ":", 1 ],
+            [ "with", 1 ]
+        ]
+    }
+]
diff --git a/pypln/web/core/models.py b/pypln/web/core/models.py
index df4e177..6808d12 100644
--- a/pypln/web/core/models.py
+++ b/pypln/web/core/models.py
@@ -21,6 +21,7 @@
 from django.contrib.auth.models import User
 from django.dispatch import receiver
 from django.db import models
+import pymongo
 from rest_framework.reverse import reverse
 from rest_framework.authtoken.models import Token
 
@@ -28,6 +29,7 @@
 from pypln.web.core.storage import MongoDBBase64Storage
 
 mongodb_storage = MongoDBBase64Storage()
+corpus_collection = pymongo.Connection(host=settings.MONGODB_URIS)[settings.MONGODB_DBNAME][settings.MONGODB_CORPORA_COLLECTION]
 
 
 class Corpus(models.Model):
@@ -43,6 +45,13 @@ class Meta:
     def __unicode__(self):
         return self.name
 
+    @property
+    def properties(self):
+        corpus_analysis = corpus_collection.find_one({"corpus_id": self.id})
+        if corpus_analysis is None:
+            return {}
+        return corpus_analysis
+
 
 class Document(models.Model):
     blob = models.FileField(upload_to='/', storage=mongodb_storage)
diff --git a/pypln/web/core/storage.py b/pypln/web/core/storage.py
index abd5130..f4103d1 100644
--- a/pypln/web/core/storage.py
+++ b/pypln/web/core/storage.py
@@ -28,7 +28,6 @@
 from django.conf import settings
 from django.utils.encoding import filepath_to_uri
 from pymongo import Connection
-from gridfs import GridFS, NoFile
 
 
 class MongoDBBase64Storage(Storage):
diff --git a/pypln/web/core/tests/test_models.py b/pypln/web/core/tests/test_models.py
index 753be0d..8f039fe 100644
--- a/pypln/web/core/tests/test_models.py
+++ b/pypln/web/core/tests/test_models.py
@@ -27,7 +27,7 @@
 from pypln.web.core.models import Corpus, Document
 from pypln.web.core.tests.utils import TestWithMongo
 
-__all__ = ["CorpusModelTest", "DocumentModelTest"]
+__all__ = ["CorpusModelTest", "CorpusPropertiesTest", "DocumentModelTest"]
 
 class CorpusModelTest(TestCase):
     fixtures = ['users']
@@ -46,6 +46,25 @@ def test_different_users_can_have_corpora_with_the_same_name(self):
         self.assertEqual(corpus_1.name, corpus_2.name)
 
 
+class CorpusPropertiesTest(TestWithMongo):
+    fixtures = ['users', 'corpora', 'corpora_analysis']
+
+    def test_returns_keyerror_when_key_does_not_exist(self):
+        expected_data = u'Test file with non-ascii char: á.'
+        corpus = Corpus.objects.all()[0]
+        with self.assertRaises(KeyError):
+            corpus.properties['analysis_that_does_not_exist']
+
+    def test_get_freqdist_from_store(self):
+        expected_data = [
+            [u"á", 1], [u"non-ascii", 1], [u".", 1],
+            [u"char", 1], [u"file", 1], [u"test", 1], [u":", 1],
+            [u"with", 1 ]
+        ]
+        corpus = Corpus.objects.get(pk=2)
+        self.assertEqual(corpus.properties['freqdist'], expected_data)
+
+
 class DocumentModelTest(TestWithMongo):
     fixtures = ['users', 'corpora', 'documents']
 
diff --git a/pypln/web/core/tests/utils.py b/pypln/web/core/tests/utils.py
index 4ed906f..43fb1d2 100644
--- a/pypln/web/core/tests/utils.py
+++ b/pypln/web/core/tests/utils.py
@@ -42,6 +42,13 @@ def _pre_setup(self, *args, **kwargs):
                 mongodb_storage.save(os.path.basename(doc.blob.name),
                     StringIO(u"Test file with non-ascii char: á.".encode('utf-8')))
 
+        if hasattr(self, 'fixtures') and self.fixtures is not None and 'corpora_analysis' in self.fixtures:
+            filename = os.path.join(settings.PROJECT_ROOT, 'core/fixtures/mongodb/corpora_analysis.json')
+            with open(filename, 'r') as mongo_fixture:
+                for obj in json_util.loads(mongo_fixture.read()):
+                    mongodb_storage._connection[settings.MONGODB_DBNAME][settings.MONGODB_CORPORA_COLLECTION].insert(obj, w=1)
+
+
     def _post_teardown(self, *args, **kwargs):
         mongodb_storage._connection.drop_database(mongodb_storage._db.name)
         super(TestWithMongo, self)._post_teardown(*args, **kwargs)
diff --git a/pypln/web/core/tests/views/test_corpus_analysis.py b/pypln/web/core/tests/views/test_corpus_analysis.py
new file mode 100644
index 0000000..dcc85de
--- /dev/null
+++ b/pypln/web/core/tests/views/test_corpus_analysis.py
@@ -0,0 +1,106 @@
+# -*- coding:utf-8 -*-
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+import json
+
+from django.contrib.auth.models import User
+from django.core.urlresolvers import reverse
+from mock import patch
+
+from pypln.web.core.models import Corpus
+from pypln.web.core.tests.utils import TestWithMongo
+
+__all__ = ["CorpusFreqDistViewTest"]
+
+
+class CorpusFreqDistViewTest(TestWithMongo):
+    fixtures = ['users', 'corpora', 'documents', 'corpora_analysis']
+
+    def test_requires_login(self):
+        response = self.client.get(reverse('corpus-freqdist',
+            kwargs={'pk': 2}))
+        self.assertEqual(response.status_code, 403)
+
+    def test_returns_404_for_inexistent_corpus(self):
+        self.client.login(username="user", password="user")
+        response = self.client.get(reverse('corpus-freqdist',
+            kwargs={'pk': 9999}))
+        self.assertEqual(response.status_code, 404)
+
+    def test_returns_404_if_user_is_not_the_owner_of_the_corpus(self):
+        self.client.login(username="user", password="user")
+        corpus = Corpus.objects.filter(owner__username="admin")[0]
+        response = self.client.get(reverse('corpus-freqdist',
+            kwargs={'pk': corpus.id}))
+        self.assertEqual(response.status_code, 404)
+
+    def test_returns_404_if_corpus_has_no_freqdist_yet(self):
+        self.client.login(username="admin", password="admin")
+        corpus = Corpus.objects.filter(owner__username="admin")[0]
+        response = self.client.get(reverse('corpus-freqdist',
+            kwargs={'pk': corpus.id}))
+        self.assertEqual(response.status_code, 404)
+
+    def test_shows_corpus_freqdist_correctly(self):
+        self.client.login(username="user", password="user")
+        corpus = Corpus.objects.filter(owner__username="user")[0]
+        response = self.client.get(reverse('corpus-freqdist',
+            kwargs={'pk': corpus.id}))
+
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.renderer_context['view'].get_object(),
+            corpus)
+        expected_data = corpus.properties['freqdist']
+        self.assertEqual(response.data['value'], expected_data)
+
+    @patch('pypln.web.core.views.calculate_corpus_freqdist')
+    def test_queue_freqdist_analysis_for_a_corpus_that_still_does_not_have_one(self,
+            calculate_corpus_freqdist):
+        """
+        This is a regression test. There used to be a bug that returned 404
+        before queueing the analysis if the corpus didn't have a freqdist
+        analysis yet.
+        """
+        self.user = User.objects.get(username="admin")
+        self.client.login(username="admin", password="admin")
+
+        corpus = self.user.corpus_set.all()[0]
+        response = self.client.put(reverse('corpus-freqdist',
+            kwargs={"pk": corpus.id}))
+
+        self.assertNotIn("freqdist", corpus.properties)
+
+        self.assertEqual(response.status_code, 200)
+        self.assertTrue(calculate_corpus_freqdist.called)
+        calculate_corpus_freqdist.assert_called_with(corpus)
+
+    @patch('pypln.web.core.views.calculate_corpus_freqdist')
+    def test_queue_freqdist_analysis_for_a_corpus_that_has_one(self,
+            calculate_corpus_freqdist):
+        self.user = User.objects.get(username="user")
+        self.client.login(username="user", password="user")
+
+        corpus = self.user.corpus_set.all()[0]
+        response = self.client.put(reverse('corpus-freqdist',
+            kwargs={"pk": corpus.id}))
+
+        self.assertIn("freqdist", corpus.properties)
+
+        self.assertEqual(response.status_code, 200)
+        self.assertTrue(calculate_corpus_freqdist.called)
+        calculate_corpus_freqdist.assert_called_with(corpus)
diff --git a/pypln/web/core/urls.py b/pypln/web/core/urls.py
index 77206cd..c893fa8 100644
--- a/pypln/web/core/urls.py
+++ b/pypln/web/core/urls.py
@@ -20,6 +20,7 @@
 from django.conf.urls import patterns, url, include
 from rest_framework.urlpatterns import format_suffix_patterns
 from pypln.web.core.views import CorpusList, CorpusDetail, CorpusDocumentList
+from pypln.web.core.views import CorpusFreqDist
 from pypln.web.core.views import DocumentList, DocumentDetail
 from pypln.web.core.views import PropertyList, PropertyDetail
 
@@ -28,6 +29,8 @@
     url(r'^user/api-token/$', 'auth_token', name='auth_token'),
     url(r'^corpora/$', CorpusList.as_view(), name='corpus-list'),
     url(r'^corpora/(?P<pk>\d+)/$', CorpusDetail.as_view(), name='corpus-detail'),
+    url(r'^corpora/(?P<pk>\d+)/freqdist/$', CorpusFreqDist.as_view(),
+        name='corpus-freqdist'),
     url(r'^corpora/(?P<pk>\d+)/documents/$', CorpusDocumentList.as_view(),
         name='corpus-document-list'),
     url(r'^documents/$', DocumentList.as_view(), name='document-list'),
diff --git a/pypln/web/core/views.py b/pypln/web/core/views.py
index fc10cd9..f4b0600 100644
--- a/pypln/web/core/views.py
+++ b/pypln/web/core/views.py
@@ -28,7 +28,8 @@
 from rest_framework.response import Response
 from rest_framework import serializers
 
-from pypln.web.backend_adapter.pipelines import create_pipeline_from_document
+from pypln.web.backend_adapter.pipelines import (create_pipeline_from_document,
+        calculate_corpus_freqdist)
 from pypln.web.core.models import Corpus, Document
 from pypln.web.core.serializers import CorpusSerializer, DocumentSerializer
 from pypln.web.core.serializers import PropertyListSerializer
@@ -116,6 +117,38 @@ def get_queryset(self):
     def perform_update(self, serializer):
         instance = serializer.save(owner=self.request.user)
 
+class CorpusFreqDist(generics.RetrieveUpdateAPIView):
+    """
+    Shows FreqDist for the corpus
+
+    `GET` requests will show the last calculated FreqDist for the corpus
+
+    `PUT` requests will queue a new task for calculating the Corpus
+    FreqDist using the documents currently contained in the corpus
+
+    """
+    model = Corpus
+    permission_classes = (permissions.IsAuthenticated, )
+
+    class CorpusFreqDistSerializer(serializers.Serializer):
+        value = serializers.ReadOnlyField(source="properties.freqdist")
+
+    serializer_class = CorpusFreqDistSerializer
+
+    def get_queryset(self):
+        return Corpus.objects.filter(owner=self.request.user)
+
+    def retrieve(self, *args, **kwargs):
+        corpus = self.get_object()
+        if "freqdist" in corpus.properties:
+            return super(CorpusFreqDist, self).retrieve(*args, **kwargs)
+        else:
+            raise Http404("FreqDist for Corpus {} is not yet available".format(corpus))
+
+    def perform_update(self, serializer):
+        calculate_corpus_freqdist(serializer.instance)
+
+
 class DocumentList(generics.ListCreateAPIView):
     """
     Lists all documents available to the current user and creates new documents.
diff --git a/pypln/web/settings.py b/pypln/web/settings.py
index 9c36239..36152b6 100644
--- a/pypln/web/settings.py
+++ b/pypln/web/settings.py
@@ -58,6 +58,7 @@ def split_uris(uri):
 
 MONGODB_DBNAME = config('MONGODB_DBNAME', default='pypln')
 MONGODB_COLLECTION = config('MONGODB_COLLECTION', default='analysis')
+MONGODB_CORPORA_COLLECTION = config('MONGODB_CORPORA_COLLECTION', default='corpora_analysis')
 
 ALLOWED_HOSTS = config('ALLOWED_HOSTS', cast=Csv())
 