(.*))/$', views.document_detail, name='document_detail')
+]
diff --git a/api/webview/views.py b/api/webview/views.py
new file mode 100644
index 00000000..20b6f1a0
--- /dev/null
+++ b/api/webview/views.py
@@ -0,0 +1,58 @@
+from rest_framework import generics
+from rest_framework.response import Response
+from rest_framework.decorators import api_view
+from django.views.decorators.clickjacking import xframe_options_exempt
+
+from api.webview.models import Document
+from api.webview.serializers import DocumentSerializer
+
+
+class DocumentList(generics.ListCreateAPIView):
+ """
+ List all documents in the SHARE API
+ """
+ serializer_class = DocumentSerializer
+
+ def perform_create(self, serializer):
+ serializer.save(source=self.request.user)
+
+ def get_queryset(self):
+ """ Return all documents
+ """
+ queryset = Document.objects.all()
+
+ return queryset
+
+
+class DocumentsFromSource(generics.ListCreateAPIView):
+ """
+ List all documents from a particular source
+ """
+ serializer_class = DocumentSerializer
+
+ def perform_create(self, serializer):
+ serializer.save(source=self.request.user)
+
+ def get_queryset(self):
+ """ Return queryset based on source
+ """
+ queryset = Document.objects.filter(source=self.kwargs['source'])
+
+ return queryset
+
+
+@api_view(['GET'])
+@xframe_options_exempt
+def document_detail(request, source, docID):
+ """
+ Retrieve one particular document.
+ """
+ try:
+ all_sources = Document.objects.filter(source=source)
+ document = all_sources.get(docID=docID)
+ except Document.DoesNotExist:
+ return Response(status=404)
+
+ if request.method == 'GET':
+ serializer = DocumentSerializer(document)
+ return Response(serializer.data)
diff --git a/dev-requirements.txt b/dev-requirements.txt
index 9a1410a5..b43600a3 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -4,6 +4,7 @@ httpretty==0.8.4
pytest-cov==1.8.1
ipdb==0.8
ipython==3.1.0
+django-pytest==0.2.0
pep8>=1.5.7,<1.6.0
pyflakes>=0.8,<0.9
diff --git a/manage.py b/manage.py
new file mode 100755
index 00000000..b100cc2d
--- /dev/null
+++ b/manage.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+import os
+import sys
+
+if __name__ == "__main__":
+ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+
+ from django.core.management import execute_from_command_line
+
+ execute_from_command_line(sys.argv)
diff --git a/requirements.txt b/requirements.txt
index 170db5f1..90866344 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,5 +16,10 @@ furl==0.4.4
jsonschema==2.4.0
jsonpointer==1.7
pycountry==1.10
+djangorestframework==3.1.3
+Django==1.8.2
+django-pgjson==0.3.1
+django-cors-headers==1.1.0
+psycopg2==2.6.1
rfc3987==1.3.4
strict-rfc3339==0.5
diff --git a/scrapi/base/__init__.py b/scrapi/base/__init__.py
index 11b5f9ab..d8610c85 100644
--- a/scrapi/base/__init__.py
+++ b/scrapi/base/__init__.py
@@ -81,7 +81,8 @@ class JSONHarvester(BaseHarvester, JSONTransformer):
def normalize(self, raw_doc):
transformed = self.transform(json.loads(raw_doc['doc']), fail=settings.RAISE_IN_TRANSFORMER)
transformed['shareProperties'] = {
- 'source': self.short_name
+ 'source': self.short_name,
+ 'docID': raw_doc['docID']
}
return NormalizedDocument(transformed, clean=True)
@@ -92,7 +93,8 @@ class XMLHarvester(BaseHarvester, XMLTransformer):
def normalize(self, raw_doc):
transformed = self.transform(etree.XML(raw_doc['doc']), fail=settings.RAISE_IN_TRANSFORMER)
transformed['shareProperties'] = {
- 'source': self.short_name
+ 'source': self.short_name,
+ 'docID': raw_doc['docID']
}
return NormalizedDocument(transformed, clean=True)
diff --git a/scrapi/events.py b/scrapi/events.py
index f6fc8eca..db943b83 100644
--- a/scrapi/events.py
+++ b/scrapi/events.py
@@ -22,6 +22,7 @@
HARVESTER_RUN = 'runHarvester'
CHECK_ARCHIVE = 'checkArchive'
NORMALIZATION = 'normalization'
+PROCESSING_URIS = 'processingUris'
# statuses
FAILED = 'failed'
diff --git a/scrapi/processing/__init__.py b/scrapi/processing/__init__.py
index b08de00c..7a1918dd 100644
--- a/scrapi/processing/__init__.py
+++ b/scrapi/processing/__init__.py
@@ -34,3 +34,9 @@ def process_raw(raw_doc, kwargs):
for p in settings.RAW_PROCESSING:
extras = kwargs.get(p, {})
get_processor(p).process_raw(raw_doc, **extras)
+
+
+def process_uris(source, docID, uri, uritype, kwargs):
+ for p in settings.POST_PROCESSING:
+ extras = kwargs.get(p, {})
+ get_processor(p).process_uris(source, docID, uri, uritype, **extras)
diff --git a/scrapi/processing/base.py b/scrapi/processing/base.py
index 155736e7..87bcb869 100644
--- a/scrapi/processing/base.py
+++ b/scrapi/processing/base.py
@@ -6,3 +6,6 @@ def process_raw(self, raw_doc, **kwargs):
def process_normalized(self, raw_doc, normalized, **kwargs):
pass # pragma: no cover
+
+ def process_uris(self, source, docID, uri, uritype, **kwargs):
+ pass # pragma: no cover
diff --git a/scrapi/processing/postgres.py b/scrapi/processing/postgres.py
new file mode 100644
index 00000000..d6c24e69
--- /dev/null
+++ b/scrapi/processing/postgres.py
@@ -0,0 +1,48 @@
+from __future__ import absolute_import
+
+import os
+import copy
+import logging
+
+from scrapi import events
+from scrapi.processing.base import BaseProcessor
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+from api.webview.models import Document
+
+logger = logging.getLogger(__name__)
+
+
+class PostgresProcessor(BaseProcessor):
+ NAME = 'postgres'
+
+ @events.logged(events.PROCESSING, 'raw.postgres')
+ def process_raw(self, raw_doc):
+ source, docID = raw_doc['source'], raw_doc['docID']
+ document = self._get_by_source_id(Document, source, docID) or Document(source=source, docID=docID)
+
+ modified_doc = copy.deepcopy(raw_doc.attributes)
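+        # The raw document's 'versions' entries may not be JSON serializable
+        # (e.g. UUID objects), so they are stored as strings.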
+ if modified_doc.get('versions'):
+            modified_doc['versions'] = list(map(str, modified_doc['versions']))
+
+ document.raw = modified_doc
+
+ document.save()
+
+ @events.logged(events.PROCESSING, 'normalized.postgres')
+ def process_normalized(self, raw_doc, normalized):
+ source, docID = raw_doc['source'], raw_doc['docID']
+ document = self._get_by_source_id(Document, source, docID) or Document(source=source, docID=docID)
+
+ document.normalized = normalized.attributes
+ document.providerUpdatedDateTime = normalized['providerUpdatedDateTime']
+
+ document.save()
+
+ def _get_by_source_id(self, model, source, docID):
+ try:
+            return model.objects.filter(source=source, docID=docID)[0]
+ except IndexError:
+ return None
diff --git a/scrapi/processing/uri_logging.py b/scrapi/processing/uri_logging.py
new file mode 100644
index 00000000..bbc10aa9
--- /dev/null
+++ b/scrapi/processing/uri_logging.py
@@ -0,0 +1,59 @@
+from __future__ import absolute_import
+
+import os
+import datetime
+import requests
+import logging
+# from furl import furl
+
+# from scrapi.processing import scrapers
+from scrapi.processing.base import BaseProcessor
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+from api.webview.models import Document
+
+
+logger = logging.getLogger(__name__)
+
+
+class UriProcessor(BaseProcessor):
+ NAME = 'uri_logging'
+
+ def process_uris(self, source, docID, uri, uritype, **kwargs):
+ try:
+ document = Document.objects.get(source=source, docID=docID)
+ processed_normalized = self.save_status_of_uri(document.normalized, uri, uritype)
+
+ document.normalized = processed_normalized
+
+ document.save()
+ except TypeError:
+ pass
+
+ def save_status_of_uri(self, normalized, uri, uritype):
+ uri_status = requests.get(uri)
+
+ status = {
+ 'actual_uri': uri,
+ 'uritype': uritype,
+ 'resolved_uri': uri_status.url,
+ 'resolved_datetime': datetime.datetime.now(),
+ 'resolved_status': uri_status.status_code,
+            'is_doi': 'dx.doi.org' in normalized['uris']['canonicalUri']
+ }
+
+ try:
+ normalized['shareProperties']['uri_logs']['status'].append(status)
+ except KeyError:
+ normalized['shareProperties']['uri_logs'] = {}
+ normalized['shareProperties']['uri_logs']['status'] = [status]
+
+ # extra_info = scrapers.collect_scraped(uri)
+
+ # if extra_info:
+ # try:
+ # normalized['shareProperties']['scraped_properties'].append(extra_info)
+ # except KeyError:
+ # normalized['shareProperties']['scraped_properties'] = [extra_info]
+
+ return normalized
diff --git a/scrapi/scrapers/__init__.py b/scrapi/scrapers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scrapi/scrapers/scrapers.py b/scrapi/scrapers/scrapers.py
new file mode 100644
index 00000000..1fe307a1
--- /dev/null
+++ b/scrapi/scrapers/scrapers.py
@@ -0,0 +1,98 @@
+from furl import furl
+from bs4 import BeautifulSoup
+
+from scrapi.scrapers import utils
+
+
+def collect_scraped(uri):
+
+ base = furl(uri).host.replace('www.', '')
+ if base == 'sciencedirect.com':
+ info = science_direct(uri)
+ elif base == 'link.springer.com':
+ info = springer_link(uri)
+ else:
+ info = {}
+
+ return info
+
+
+# Science Direct
+
+
+def science_direct(uri):
+    '''
+    Scrape additional metadata from a Science Direct article page.
+
+    Currently limited to the author/affiliation list; more could be added later.
+    '''
+
+ return parse_sd_author_list(uri)
+
+
+def parse_sd_author_list(uri):
+
+ soup = BeautifulSoup(utils.get_html_from_link(uri), "lxml")
+
+ auth_affil = soup.find_all("ul", class_="authorGroup")
+    author_entries = [item.find_all('li') for item in auth_affil][0]
+    author_links = [entry.find_all('a') for entry in author_entries]
+
+    full_author_info = []
+    for author_pair in author_links:
+ author_info = {}
+ for author_part in author_pair:
+ author_info = utils.merge_dicts(author_part.attrs, author_info)
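+        # After merging, 'href' may be a single value or a list; the first
+        # entry is expected to be the '#<affiliation id>' anchor used below
+        # to join authors with their affiliations.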
+ author_info['matcher'] = author_info['href'][0]
+
+ full_author_info.append(author_info)
+
+ # Get affiliations and put them with the original author dict
+ outer_affilations = soup.find_all("ul", class_="authAffil")
+ affiliations = [item.find_all('li') for item in outer_affilations]
+
+ affils = []
+ for person in affiliations:
+ for result in person:
+ d = result.attrs
+ d['institution'] = result.text
+ d['matcher'] = d['id']
+ affils.append(d)
+
+ all_authors = []
+ for author in full_author_info:
+ del author['data-pos']
+ del author['data-t']
+ del author['data-tb']
+
+ for affil in affils:
+ if author['matcher'].replace('#', '') == affil['matcher']:
+ combined = author.copy()
+ combined.update(affil)
+ all_authors.append(combined)
+ del author['matcher']
+
+ return all_authors
+
+
+# Springer Link
+
+def springer_link(uri):
+ element = utils.get_elements_from_link(uri)
+ return {'open_access': get_springer_open_access(element)}
+
+
+def get_springer_open_access(element):
+ links = element.xpath('//a')
+ words = []
+ for link in links:
+ if 'viewtype' in link.keys():
+            if 'webtrekk-track' in (link.get('class') or ''):
+ words.append(link.get('viewtype'))
+ if 'Denial' in words:
+ return False
+ else:
+ return True
diff --git a/scrapi/scrapers/utils.py b/scrapi/scrapers/utils.py
new file mode 100644
index 00000000..f02c8699
--- /dev/null
+++ b/scrapi/scrapers/utils.py
@@ -0,0 +1,28 @@
+import requests
+from lxml import etree
+
+
+def merge_dicts(*dicts):
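+    # Collect values that share a key into a list, then unwrap any
+    # single-item lists back to the lone value.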
+ d = {}
+    for mapping in dicts:
+        for key in mapping:
+            try:
+                d[key].append(mapping[key])
+            except KeyError:
+                d[key] = [mapping[key]]
+ for key in d:
+ if len(d[key]) == 1:
+ d[key] = d[key][0]
+
+ return d
+
+
+def get_elements_from_link(link):
+ content = requests.get(link).content
+ return etree.HTML(content)
+
+
+def get_html_from_link(link):
+ return requests.get(link).content
diff --git a/scrapi/settings/local-dist.py b/scrapi/settings/local-dist.py
index 826d3528..5d6ada4a 100644
--- a/scrapi/settings/local-dist.py
+++ b/scrapi/settings/local-dist.py
@@ -12,6 +12,7 @@
NORMALIZED_PROCESSING = []
RAW_PROCESSING = []
+POST_PROCESSING = []
SENTRY_DSN = None
diff --git a/scrapi/tasks.py b/scrapi/tasks.py
index c196d84e..948605e2 100644
--- a/scrapi/tasks.py
+++ b/scrapi/tasks.py
@@ -1,4 +1,5 @@
import logging
+import json
import functools
from itertools import islice
from datetime import date, timedelta
@@ -125,6 +126,49 @@ def process_normalized(normalized_doc, raw_doc, **kwargs):
processing.process_normalized(raw_doc, normalized_doc, kwargs)
+@task_autoretry(default_retry_delay=settings.CELERY_RETRY_DELAY, max_retries=0)
+@events.logged(events.PROCESSING_URIS, 'uri_processing')
+def process_uris(async, **kwargs):
+ settings.CELERY_ALWAYS_EAGER = not async
+
+ all_buckets = []
+ if kwargs.get('source'):
+ source_buckets = util.parse_urls_into_groups(kwargs['source'])
+ all_buckets.append(source_buckets)
+ else:
+ for source in registry.keys():
+ source_buckets = util.parse_urls_into_groups(source)
+ all_buckets.append(source_buckets)
+
+ with open('all_sources.json', 'w') as outfile:
+ json.dump(all_buckets, outfile)
+
+ # for source_dict in all_buckets:
+ # for group in source_dict['uris']:
+ # process_uris_at_one_base_uri.delay(group['individual_uris'], async, kwargs=kwargs)
+
+
+@task_autoretry(default_retry_delay=settings.CELERY_RETRY_DELAY, max_retries=0)
+@events.logged(events.PROCESSING_URIS, 'uri_processing')
+def process_uris_at_one_base_uri(uri_list, async=False, **kwargs):
+ settings.CELERY_ALWAYS_EAGER = not async
+
+ for uri in uri_list:
+ process_one_uri.delay(uri, kwargs=kwargs)
+
+
+@task_autoretry(default_retry_delay=settings.CELERY_RETRY_DELAY, max_retries=0, rate_limit='5/s')
+@events.logged(events.PROCESSING_URIS, 'uri_processing')
+def process_one_uri(uri, **kwargs):
+ processing.process_uris(
+ source=uri['source'],
+ docID=uri['docID'],
+ uri=uri['uri'],
+ uritype=uri['uritype'],
+ kwargs=kwargs
+ )
+
+
@app.task
def migrate(migration, sources=tuple(), async=False, dry=True, group_size=1000, **kwargs):
from scrapi.migrations import documents
diff --git a/scrapi/util.py b/scrapi/util.py
index 0116141d..86bf0b08 100644
--- a/scrapi/util.py
+++ b/scrapi/util.py
@@ -1,8 +1,15 @@
from datetime import datetime
+import os
+import re
import six
import pytz
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+from api.webview.models import Document
+
+URL_RE = re.compile(r'(https?:\/\/[^\/]*)')
+
def timestamp():
return pytz.utc.localize(datetime.utcnow()).isoformat()
@@ -51,3 +58,67 @@ def json_without_bytes(jobj):
if isinstance(v, six.binary_type):
jobj[k] = v.decode('utf8')
return jobj
+
+
+def parse_urls_into_groups(source):
+
+ source_dict = {'source': source, 'uris': [], 'all_bases': []}
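+    # Walk every normalized document for this source and bucket each of its
+    # URIs (canonical, provider, descriptor and object) by base URI.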
+ for document in Document.objects.filter(source=source):
+ if document.normalized:
+ docID = document.normalized['shareProperties']['docID']
+
+ source_dict = uri_processing(
+ document.normalized['uris']['canonicalUri'],
+ source,
+ docID,
+ source_dict,
+                'canonicalUri'
+ )
+
+ if document.normalized['uris'].get('providerUris'):
+ for uri in document.normalized['uris']['providerUris']:
+ source_dict = uri_processing(uri, source, docID, source_dict, 'providerUris')
+ if document.normalized['uris'].get('descriptorUris'):
+ for uri in document.normalized['uris']['descriptorUris']:
+ source_dict = uri_processing(uri, source, docID, source_dict, 'descriptorUris')
+ if document.normalized['uris'].get('objectUris'):
+ for uri in document.normalized['uris']['objectUris']:
+ if uri:
+ if isinstance(uri, list):
+ for element in uri:
+ source_dict = uri_processing(element, source, docID, source_dict, 'objectUris')
+ else:
+ source_dict = uri_processing(uri, source, docID, source_dict, 'objectUris')
+
+ return source_dict
+
+
+def uri_processing(uri, source, docID, source_dict, uritype):
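+    # Group URIs by their base (scheme + host): append to the existing bucket
+    # for this base if there is one, otherwise start a new bucket.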
+ base_uri = URL_RE.search(uri).group()
+
+ if base_uri in source_dict['all_bases']:
+ for entry in source_dict['uris']:
+ if base_uri == entry['base_uri']:
+ entry['individual_uris'].append({
+ 'uri': uri,
+ 'source': source,
+ 'docID': docID,
+ 'uritype': uritype
+ })
+ else:
+ source_dict['uris'].append({
+ 'base_uri': base_uri,
+ 'individual_uris': [{
+ 'uri': uri,
+ 'source': source,
+ 'docID': docID,
+ 'uritype': uritype
+ }]
+ })
+ source_dict['all_bases'].append(base_uri)
+
+ return source_dict
diff --git a/tasks.py b/tasks.py
index e4adbc1a..3dba470e 100644
--- a/tasks.py
+++ b/tasks.py
@@ -1,3 +1,4 @@
+import os
import base64
import logging
import platform
@@ -13,19 +14,20 @@
from scrapi import registry
from scrapi import settings
-from scrapi.processing.elasticsearch import es
logger = logging.getLogger()
@task
def reindex(src, dest):
+ from scrapi.processing.elasticsearch import es
helpers.reindex(es, src, dest)
es.indices.delete(src)
@task
def alias(alias, index):
+ from scrapi.processing.elasticsearch import es
es.indices.delete_alias(index=alias, name='_all', ignore=404)
es.indices.put_alias(alias, index)
@@ -122,7 +124,7 @@ def test(cov=True, verbose=False, debug=False):
if debug:
cmd += ' -s'
if cov:
- cmd += ' --cov-report term-missing --cov-config .coveragerc --cov scrapi'
+ cmd += ' --cov-report term-missing --cov-config .coveragerc --cov scrapi --cov api'
run(cmd, pty=True)
@@ -185,6 +187,14 @@ def harvesters(async=False, start=None, end=None):
logger.exception(e)
+@task
+def process_uris(async=False, source=None):
+ settings.CELERY_ALWAYS_EAGER = not async
+ from scrapi.tasks import process_uris
+
+ process_uris.delay(async=async, source=source)
+
+
@task
def lint_all():
for name in registry.keys():
@@ -224,3 +234,25 @@ def provider_map(delete=False):
refresh=True
)
print(es.count('share_providers', body={'query': {'match_all': {}}})['count'])
+
+
+@task
+def apiserver():
+ os.system('python manage.py runserver')
+
+
+@task
+def reset_all():
+ try:
+ input = raw_input
+    except NameError:
+ pass
+ if input('Are you sure? y/N ') != 'y':
+ return
+ os.system('psql -c "DROP DATABASE scrapi;"')
+ os.system('psql -c "CREATE DATABASE scrapi;"')
+ os.system('python manage.py migrate')
+
+ os.system("curl -XDELETE '{}/share*'".format(settings.ELASTIC_URI))
+ os.system("invoke alias share share_v2")
+ os.system("invoke provider_map")
diff --git a/tests/test_api_views.py b/tests/test_api_views.py
new file mode 100644
index 00000000..61b84d53
--- /dev/null
+++ b/tests/test_api_views.py
@@ -0,0 +1,43 @@
+import os
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+
+import django
+django.setup()
+
+from django.test import TestCase
+from rest_framework.test import APIRequestFactory
+
+from api.webview.views import DocumentList, DocumentsFromSource, document_detail
+
+
+class APIViewTests(TestCase):
+
+ def setUp(self):
+ self.factory = APIRequestFactory()
+
+ def test_document_view(self):
+ view = DocumentList.as_view()
+ request = self.factory.get(
+ '/documents/'
+ )
+ response = view(request)
+
+ self.assertEqual(response.status_code, 200)
+
+ def test_source_view(self):
+        view = DocumentsFromSource.as_view()
+        request = self.factory.get(
+            '/documents/dudley_weekly/'
+        )
+        response = view(request, source='dudley_weekly')
+
+ self.assertEqual(response.status_code, 200)
+
+    def test_individual_view(self):
+        request = self.factory.get(
+            '/documents/dudley_weekly/dudley1'
+        )
+        # No document has been saved, so the detail view should return 404
+        response = document_detail(request, 'dudley_weekly', 'dudley1')
+
+        self.assertEqual(response.status_code, 404)
diff --git a/tests/test_json_harvester.py b/tests/test_json_harvester.py
index b54e4b19..8960f734 100644
--- a/tests/test_json_harvester.py
+++ b/tests/test_json_harvester.py
@@ -33,7 +33,8 @@
},
"providerUpdatedDateTime": "2015-02-02T00:00:00+00:00",
"shareProperties": {
- "source": "test"
+ "source": "test",
+ "docID": "1"
},
"otherProperties": [
{
diff --git a/tests/test_migrations.py b/tests/test_migrations.py
index b7fecbd1..5807f2d7 100644
--- a/tests/test_migrations.py
+++ b/tests/test_migrations.py
@@ -12,14 +12,15 @@
from scrapi.migrations import rename
from scrapi.migrations import renormalize
from scrapi.migrations import DocumentModelOld
-from scrapi.migrations import document_v2_migration
# Need to force cassandra to ignore set keyspace
from scrapi.processing.cassandra import CassandraProcessor, DocumentModel
+from scrapi.processing.postgres import PostgresProcessor
from . import utils
test_cass = CassandraProcessor()
+test_postgres = PostgresProcessor()
test_harvester = utils.TestHarvester()
diff --git a/tests/test_postgres_processor.py b/tests/test_postgres_processor.py
new file mode 100644
index 00000000..841db7ed
--- /dev/null
+++ b/tests/test_postgres_processor.py
@@ -0,0 +1,31 @@
+# import pytest
+
+import os
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+
+import django
+django.setup()
+
+from django.test import TestCase
+from scrapi.processing.postgres import PostgresProcessor, Document
+
+from . import utils
+from scrapi.linter.document import RawDocument, NormalizedDocument
+
+test_db = PostgresProcessor()
+
+RAW = RawDocument(utils.POSTGRES_RAW_DOC)
+NORMALIZED = NormalizedDocument(utils.RECORD)
+
+
+class DocumentTestCase(TestCase):
+
+ def test_raw_processing(self):
+ test_db.process_raw(RAW)
+        document = Document.objects.get(source=RAW['source'], docID=RAW['docID'])
+        assert document.docID == RAW.attributes['docID']
+
+ def test_normalized_processing(self):
+ test_db.process_normalized(RAW, NORMALIZED)
+        document = Document.objects.get(source=RAW['source'], docID=RAW['docID'])
+        assert document.source == NORMALIZED['shareProperties']['source']
diff --git a/tests/utils.py b/tests/utils.py
index 770812c6..a9d13d0b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -21,6 +21,18 @@
'source': 'test'
}
+POSTGRES_RAW_DOC = {
+ 'doc': '{}',
+ 'docID': 'someID',
+ 'timestamps': {
+ 'harvestFinished': '2012-11-30T17:05:48+00:00',
+ 'harvestStarted': '2012-11-30T17:05:48+00:00',
+ 'harvestTaskCreated': '2012-11-30T17:05:48+00:00'
+ },
+ 'filetype': 'json',
+ 'source': 'test'
+}
+
NORMALIZED_DOC = {
'title': 'No',
'contributors': [{'name': ''}],