diff --git a/contrib/postactivate b/contrib/postactivate
index 9482463..b3634e7 100755
--- a/contrib/postactivate
+++ b/contrib/postactivate
@@ -29,6 +29,10 @@ function manage_with_settings() {
PYTHONPATH="$PYPLN_ROOT:$PYTHONPATH" python "$PYPLN_ROOT"/manage.py $* --settings=pypln.web.settings.$SETTINGS;
}
+# Run manage.py with the project root on PYTHONPATH, without appending a
+# --settings flag.
+function manage() {
+    # "$@" forwards every argument as its own word, so arguments containing
+    # spaces or glob characters reach manage.py intact (an unquoted $* would
+    # re-split and glob-expand them).
+    PYTHONPATH="$PYPLN_ROOT:$PYTHONPATH" python "$PYPLN_ROOT"/manage.py "$@"
+}
+
alias manage_dev="manage_with_settings development"
alias manage_test="manage_with_settings test"
alias run_tests="manage_test test pypln.web.core.tests"
diff --git a/pypln/web/backend_adapter/pipelines.py b/pypln/web/backend_adapter/pipelines.py
index 4ffbf3f..f77505e 100644
--- a/pypln/web/backend_adapter/pipelines.py
+++ b/pypln/web/backend_adapter/pipelines.py
@@ -44,6 +44,10 @@ def create_indexing_pipeline(doc):
{"index_name": doc.index_name, "doc_type": doc.doc_type}})
(Extractor().si(doc_id) | ElasticIndexer().si(doc_id))()
+def calculate_corpus_freqdist(corpus):
+ blob_ids = map(ObjectId, corpus.document_set.values_list('blob', flat=True))
+ CorpusFreqDist().delay(corpus.pk, blob_ids)
+
def get_config_from_router(api, timeout=5):
client = Client()
client.connect(api)
diff --git a/pypln/web/backend_adapter/tests.py b/pypln/web/backend_adapter/tests.py
index 28e18a8..936963e 100644
--- a/pypln/web/backend_adapter/tests.py
+++ b/pypln/web/backend_adapter/tests.py
@@ -26,13 +26,13 @@
from mock import patch
from pypln.web.backend_adapter.pipelines import (create_indexing_pipeline,
- call_default_pipeline, create_pipeline_from_document)
-from pypln.web.core.models import IndexedDocument, Document, mongodb_storage
+ call_default_pipeline, create_pipeline_from_document, calculate_corpus_freqdist)
+from pypln.web.core.models import IndexedDocument, Document, mongodb_storage, Corpus
from pypln.web.core.tests.utils import TestWithMongo
__all__ = ["CreatePipelineTest", "CreateIndexingPipelineTest",
- "CreatePipelineFromDocumentTest"]
+ "CreatePipelineFromDocumentTest", "CorpusFreqDistTest"]
class CreatePipelineTest(TestWithMongo):
@@ -68,14 +68,14 @@ def test_should_create_indexing_pipelines_for_document(self, extractor):
extractor.assert_called_with()
extractor.return_value.si.assert_called_with(ObjectId(self.document.blob.name))
- @patch('pypln.web.backend_adapter.pipelines.GridFSDataRetriever', autospec=True)
+ @patch('pypln.web.backend_adapter.pipelines.Extractor', autospec=True)
def test_should_add_index_name_to_the_document_in_mongo(self,
gridfs_data_retriever):
create_indexing_pipeline(self.document)
mongo_document = self.get_mongo_doc(self.document)
self.assertEqual(mongo_document['index_name'], self.document.index_name)
- @patch('pypln.web.backend_adapter.pipelines.GridFSDataRetriever', autospec=True)
+ @patch('pypln.web.backend_adapter.pipelines.Extractor', autospec=True)
def test_should_add_doc_type_to_the_document_in_mongo(self,
gridfs_data_retriever):
create_indexing_pipeline(self.document)
@@ -91,3 +91,16 @@ def test_create_pipeline_from_document_instantiates_a_document_id(self, fake_cal
doc = Document.objects.all()[0]
create_pipeline_from_document(doc)
fake_call_default_pipeline.assert_called_with(ObjectId(doc.blob.name))
+
+
+class CorpusFreqDistTest(TestWithMongo):
+ fixtures = ['users', 'corpora', 'documents']
+
+ @patch('pypln.web.backend_adapter.pipelines.CorpusFreqDist', autospec=True)
+ def test_should_call_CorpusFreqDist_with_document_ids(self,
+ corpus_freqdist_worker):
+ corpus = Corpus.objects.get(pk=2)
+ ids = [ObjectId("562526d9798ebd4616b23bb1")]
+ calculate_corpus_freqdist(corpus)
+ corpus_freqdist_worker.assert_called_with()
+ corpus_freqdist_worker.return_value.delay.assert_called_with(corpus.pk, ids)
diff --git a/pypln/web/core/fixtures/mongodb/corpora_analysis.json b/pypln/web/core/fixtures/mongodb/corpora_analysis.json
new file mode 100644
index 0000000..ff77cb0
--- /dev/null
+++ b/pypln/web/core/fixtures/mongodb/corpora_analysis.json
@@ -0,0 +1,16 @@
+[
+ {
+ "_id" : { "$oid": "5785005257bc3a1070d8cdbf" },
+ "corpus_id" : 2,
+ "freqdist" : [
+ [ "á", 1 ],
+ [ "non-ascii", 1 ],
+ [ ".", 1 ],
+ [ "char", 1 ],
+ [ "file", 1 ],
+ [ "test", 1 ],
+ [ ":", 1 ],
+ [ "with", 1 ]
+ ]
+ }
+]
diff --git a/pypln/web/core/models.py b/pypln/web/core/models.py
index df4e177..6808d12 100644
--- a/pypln/web/core/models.py
+++ b/pypln/web/core/models.py
@@ -21,6 +21,7 @@
from django.contrib.auth.models import User
from django.dispatch import receiver
from django.db import models
+import pymongo
from rest_framework.reverse import reverse
from rest_framework.authtoken.models import Token
@@ -28,6 +29,7 @@
from pypln.web.core.storage import MongoDBBase64Storage
mongodb_storage = MongoDBBase64Storage()
+# Handle to the MongoDB collection that stores per-corpus analysis documents.
+# NOTE(review): the connection is created when this module is imported.
+corpus_collection = pymongo.Connection(host=settings.MONGODB_URIS)[settings.MONGODB_DBNAME][settings.MONGODB_CORPORA_COLLECTION]
class Corpus(models.Model):
@@ -43,6 +45,13 @@ class Meta:
def __unicode__(self):
return self.name
+ @property
+ def properties(self):
+ corpus_analysis = corpus_collection.find_one({"corpus_id": self.id})
+ if corpus_analysis is None:
+ return {}
+ return corpus_analysis
+
class Document(models.Model):
blob = models.FileField(upload_to='/', storage=mongodb_storage)
diff --git a/pypln/web/core/storage.py b/pypln/web/core/storage.py
index abd5130..f4103d1 100644
--- a/pypln/web/core/storage.py
+++ b/pypln/web/core/storage.py
@@ -28,7 +28,6 @@
from django.conf import settings
from django.utils.encoding import filepath_to_uri
from pymongo import Connection
-from gridfs import GridFS, NoFile
class MongoDBBase64Storage(Storage):
diff --git a/pypln/web/core/tests/test_models.py b/pypln/web/core/tests/test_models.py
index 753be0d..8f039fe 100644
--- a/pypln/web/core/tests/test_models.py
+++ b/pypln/web/core/tests/test_models.py
@@ -27,7 +27,7 @@
from pypln.web.core.models import Corpus, Document
from pypln.web.core.tests.utils import TestWithMongo
-__all__ = ["CorpusModelTest", "DocumentModelTest"]
+__all__ = ["CorpusModelTest", "CorpusPropertiesTest", "DocumentModelTest"]
class CorpusModelTest(TestCase):
fixtures = ['users']
@@ -46,6 +46,25 @@ def test_different_users_can_have_corpora_with_the_same_name(self):
self.assertEqual(corpus_1.name, corpus_2.name)
+class CorpusPropertiesTest(TestWithMongo):
+ fixtures = ['users', 'corpora', 'corpora_analysis']
+
+ def test_returns_keyerror_when_key_does_not_exist(self):
+ expected_data = u'Test file with non-ascii char: á.'
+ corpus = Corpus.objects.all()[0]
+ with self.assertRaises(KeyError):
+ corpus.properties['analysis_that_does_not_exist']
+
+ def test_get_freqdist_from_store(self):
+ expected_data = [
+ [u"á", 1], [u"non-ascii", 1], [u".", 1],
+ [u"char", 1], [u"file", 1], [u"test", 1], [u":", 1],
+ [u"with", 1 ]
+ ]
+ corpus = Corpus.objects.get(pk=2)
+ self.assertEqual(corpus.properties['freqdist'], expected_data)
+
+
class DocumentModelTest(TestWithMongo):
fixtures = ['users', 'corpora', 'documents']
diff --git a/pypln/web/core/tests/utils.py b/pypln/web/core/tests/utils.py
index 4ed906f..43fb1d2 100644
--- a/pypln/web/core/tests/utils.py
+++ b/pypln/web/core/tests/utils.py
@@ -42,6 +42,13 @@ def _pre_setup(self, *args, **kwargs):
mongodb_storage.save(os.path.basename(doc.blob.name),
StringIO(u"Test file with non-ascii char: á.".encode('utf-8')))
+ if hasattr(self, 'fixtures') and self.fixtures is not None and 'corpora_analysis' in self.fixtures:
+ filename = os.path.join(settings.PROJECT_ROOT, 'core/fixtures/mongodb/corpora_analysis.json')
+ with open(filename, 'r') as mongo_fixture:
+ for obj in json_util.loads(mongo_fixture.read()):
+ mongodb_storage._connection[settings.MONGODB_DBNAME][settings.MONGODB_CORPORA_COLLECTION].insert(obj, w=1)
+
+
def _post_teardown(self, *args, **kwargs):
mongodb_storage._connection.drop_database(mongodb_storage._db.name)
super(TestWithMongo, self)._post_teardown(*args, **kwargs)
diff --git a/pypln/web/core/tests/views/test_corpus_analysis.py b/pypln/web/core/tests/views/test_corpus_analysis.py
new file mode 100644
index 0000000..dcc85de
--- /dev/null
+++ b/pypln/web/core/tests/views/test_corpus_analysis.py
@@ -0,0 +1,106 @@
+# -*- coding:utf-8 -*-
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+import json
+
+from django.contrib.auth.models import User
+from django.core.urlresolvers import reverse
+from mock import patch
+
+from pypln.web.core.models import Corpus, User
+from pypln.web.core.tests.utils import TestWithMongo
+
+__all__ = ["CorpusFreqDistViewTest"]
+
+
+class CorpusFreqDistViewTest(TestWithMongo):
+ fixtures = ['users', 'corpora', 'documents', 'corpora_analysis']
+
+ def test_requires_login(self):
+ response = self.client.get(reverse('corpus-freqdist',
+ kwargs={'pk': 2}))
+ self.assertEqual(response.status_code, 403)
+
+ def test_returns_404_for_inexistent_corpus(self):
+ self.client.login(username="user", password="user")
+ response = self.client.get(reverse('corpus-freqdist',
+ kwargs={'pk': 9999}))
+ self.assertEqual(response.status_code, 404)
+
+ def test_returns_404_if_user_is_not_the_owner_of_the_corpus(self):
+ self.client.login(username="user", password="user")
+ corpus = Corpus.objects.filter(owner__username="admin")[0]
+ response = self.client.get(reverse('corpus-freqdist',
+ kwargs={'pk': corpus.id}))
+ self.assertEqual(response.status_code, 404)
+
+ def test_returns_404_if_corpus_has_no_freqdist_yet(self):
+ self.client.login(username="admin", password="admin")
+ corpus = Corpus.objects.filter(owner__username="admin")[0]
+ response = self.client.get(reverse('corpus-freqdist',
+ kwargs={'pk': corpus.id}))
+ self.assertEqual(response.status_code, 404)
+
+ def test_shows_corpus_freqdist_correctly(self):
+ self.client.login(username="user", password="user")
+ corpus = Corpus.objects.filter(owner__username="user")[0]
+ response = self.client.get(reverse('corpus-freqdist',
+ kwargs={'pk': corpus.id}))
+
+ self.assertEqual(response.status_code, 200)
+ self.assertEqual(response.renderer_context['view'].get_object(),
+ corpus)
+ expected_data = corpus.properties['freqdist']
+ self.assertEqual(response.data['value'], expected_data)
+
+ @patch('pypln.web.core.views.calculate_corpus_freqdist')
+ def test_queue_freqdist_analysis_for_a_corpus_that_still_does_not_have_one(self,
+ calculate_corpus_freqdist):
+ """
+ This is a regression test. There used to be a bug that returned 404
+ before queueing the analysis if the corpus didn't have a freqdist
+ analysis yet.
+ """
+ self.user = User.objects.get(username="admin")
+ self.client.login(username="admin", password="admin")
+
+ corpus = self.user.corpus_set.all()[0]
+ response = self.client.put(reverse('corpus-freqdist',
+ kwargs={"pk": corpus.id}))
+
+ self.assertFalse(corpus.properties.has_key("freqdist"))
+
+ self.assertEqual(response.status_code, 200)
+ self.assertTrue(calculate_corpus_freqdist.called)
+ calculate_corpus_freqdist.assert_called_with(corpus)
+
+ @patch('pypln.web.core.views.calculate_corpus_freqdist')
+ def test_queue_freqdist_analysis_for_a_corpus_that_has_one(self,
+ calculate_corpus_freqdist):
+ self.user = User.objects.get(username="user")
+ self.client.login(username="user", password="user")
+
+ corpus = self.user.corpus_set.all()[0]
+ response = self.client.put(reverse('corpus-freqdist',
+ kwargs={"pk": corpus.id}))
+
+ self.assertTrue(corpus.properties.has_key("freqdist"))
+
+ self.assertEqual(response.status_code, 200)
+ self.assertTrue(calculate_corpus_freqdist.called)
+ calculate_corpus_freqdist.assert_called_with(corpus)
diff --git a/pypln/web/core/urls.py b/pypln/web/core/urls.py
index 77206cd..c893fa8 100644
--- a/pypln/web/core/urls.py
+++ b/pypln/web/core/urls.py
@@ -20,6 +20,7 @@
from django.conf.urls import patterns, url, include
from rest_framework.urlpatterns import format_suffix_patterns
from pypln.web.core.views import CorpusList, CorpusDetail, CorpusDocumentList
+from pypln.web.core.views import CorpusFreqDist
from pypln.web.core.views import DocumentList, DocumentDetail
from pypln.web.core.views import PropertyList, PropertyDetail
@@ -28,6 +29,8 @@
url(r'^user/api-token/$', 'auth_token', name='auth_token'),
url(r'^corpora/$', CorpusList.as_view(), name='corpus-list'),
url(r'^corpora/(?P\d+)/$', CorpusDetail.as_view(), name='corpus-detail'),
+ url(r'^corpora/(?P\d+)/freqdist/$', CorpusFreqDist.as_view(),
+ name='corpus-freqdist'),
url(r'^corpora/(?P\d+)/documents/$', CorpusDocumentList.as_view(),
name='corpus-document-list'),
url(r'^documents/$', DocumentList.as_view(), name='document-list'),
diff --git a/pypln/web/core/views.py b/pypln/web/core/views.py
index fc10cd9..f4b0600 100644
--- a/pypln/web/core/views.py
+++ b/pypln/web/core/views.py
@@ -28,7 +28,8 @@
from rest_framework.response import Response
from rest_framework import serializers
-from pypln.web.backend_adapter.pipelines import create_pipeline_from_document
+from pypln.web.backend_adapter.pipelines import (create_pipeline_from_document,
+ calculate_corpus_freqdist)
from pypln.web.core.models import Corpus, Document
from pypln.web.core.serializers import CorpusSerializer, DocumentSerializer
from pypln.web.core.serializers import PropertyListSerializer
@@ -116,6 +117,38 @@ def get_queryset(self):
def perform_update(self, serializer):
instance = serializer.save(owner=self.request.user)
+class CorpusFreqDist(generics.RetrieveUpdateAPIView):
+ """
+ Shows FreqDist for the corpus
+
+ `GET` requests will show the last calculated FreqDist for the corpus
+
+ `PUT` requests will queue a new task for calculating the Corpus
+ FreqDist using the documents currently contained in the corpus
+
+ """
+ model = Corpus
+ permission_classes = (permissions.IsAuthenticated, )
+
+ class CorpusFreqDistSerializer(serializers.Serializer):
+ value = serializers.ReadOnlyField(source="properties.freqdist")
+
+ serializer_class = CorpusFreqDistSerializer
+
+ def get_queryset(self):
+ return Corpus.objects.filter(owner=self.request.user)
+
+ def retrieve(self, *args, **kwargs):
+ corpus = self.get_object()
+ if corpus.properties.has_key("freqdist"):
+ return super(CorpusFreqDist, self).retrieve(self, *args, **kwargs)
+ else:
+ raise Http404("FreqDist for Corpus {} is not yet available".format(corpus))
+
+ def perform_update(self, serializer):
+ calculate_corpus_freqdist(serializer.instance)
+
+
class DocumentList(generics.ListCreateAPIView):
"""
Lists all documents available to the current user and creates new documents.
diff --git a/pypln/web/settings.py b/pypln/web/settings.py
index 9c36239..36152b6 100644
--- a/pypln/web/settings.py
+++ b/pypln/web/settings.py
@@ -58,6 +58,7 @@ def split_uris(uri):
MONGODB_DBNAME = config('MONGODB_DBNAME', default='pypln')
MONGODB_COLLECTION = config('MONGODB_COLLECTION', default='analysis')
+# Name of the MongoDB collection that holds corpus-level analysis documents.
+MONGODB_CORPORA_COLLECTION = config('MONGODB_CORPORA_COLLECTION', default='corpora_analysis')
ALLOWED_HOSTS = config('ALLOWED_HOSTS', cast=Csv())