diff --git a/.gitignore b/.gitignore
index e4109c0a..278ea6e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,7 +58,7 @@ archive/*
 .DS_Store
 worker_manager/recent_files.txt
-scrapi/settings/local.py
+**/settings/local.py
 celerybeat-schedule.db
 records/
 archive/*
diff --git a/.travis.yml b/.travis.yml
index ea91f762..5c030456 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,13 +7,21 @@ python:
 
 services:
   - cassandra
   - elasticsearch
+  - postgresql
+
+addons:
+  postgresql: "9.4"
 
 install:
+  - cp api/api/settings/local-travis.py api/api/settings/local.py
   - pip install -r dev-requirements.txt
   - pip install coveralls
   - cp scrapi/settings/travis-dist.py scrapi/settings/local.py
 
-before_script: flake8 .
+before_script:
+  - flake8 .
+  - psql -c "CREATE DATABASE scrapi;" -U postgres
+  - python manage.py migrate
 
 script:
   - invoke provider_map
diff --git a/api/__init__.py b/api/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/api/api/__init__.py b/api/api/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/api/api/settings/__init__.py b/api/api/settings/__init__.py
new file mode 100644
index 00000000..34a6c824
--- /dev/null
+++ b/api/api/settings/__init__.py
@@ -0,0 +1,6 @@
+from .defaults import *
+
+try:
+    from .local import *
+except ImportError as error:
+    raise ImportError("No local.py settings file found.")
diff --git a/api/api/settings/defaults.py b/api/api/settings/defaults.py
new file mode 100644
index 00000000..b297a5c3
--- /dev/null
+++ b/api/api/settings/defaults.py
@@ -0,0 +1,121 @@
+"""
+Django settings for shareregistration project.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/1.7/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/1.7/ref/settings/
+"""
+
+# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
+import os
+
+from django.conf import global_settings
+
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = '...'
+
+# SECURITY WARNING: don't run with debug turned on in production!
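+# DEBUG stays off in these committed defaults; the local-dist.py template
+# flips it on for development.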
+DEBUG = False
+
+TEMPLATE_DEBUG = True
+
+DOMAIN = 'http://localhost:8000'
+
+ALLOWED_HOSTS = [
+    '.osf.io',
+    'testserver'
+]
+
+# Application definition
+
+INSTALLED_APPS = (
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+    'django.contrib.contenttypes',
+    'django.contrib.sites',
+    'api.webview',
+    'rest_framework',
+    'corsheaders'
+)
+
+SITE_ID = 1
+
+ACCOUNT_ACTIVATION_DAYS = 7
+
+MIDDLEWARE_CLASSES = (
+    'corsheaders.middleware.CorsMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+)
+
+TEMPLATE_CONTEXT_PROCESSORS = global_settings.TEMPLATE_CONTEXT_PROCESSORS + (
+    'django.core.context_processors.debug',
+    'django.contrib.auth.context_processors.auth',
+)
+
+ROOT_URLCONF = 'api.api.urls'
+
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.postgresql_psycopg2',
+        'NAME': 'scrapi'
+    }
+}
+
+# Internationalization
+# https://docs.djangoproject.com/en/1.7/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'EST'
+
+USE_I18N = True
+
+USE_L10N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/1.7/howto/static-files/
+
+STATICFILES_DIRS = (
+    os.path.join(
+        os.path.dirname(__file__),
+        '..',  # up one level from the settings directory
+        'static'
+    ),
+)
+
+STATIC_ROOT = os.path.join(BASE_DIR, 'static')
+STATIC_URL = '{}/static/'.format(DOMAIN)
+
+TEMPLATE_DIRS = [os.path.join(BASE_DIR, 'templates')]
+
+REST_FRAMEWORK = {
+    'PAGE_SIZE': 10,
+    'DEFAULT_AUTHENTICATION_CLASSES': (
+        'rest_framework.authentication.BasicAuthentication',
+        'rest_framework.authentication.SessionAuthentication',
+        'rest_framework.authentication.TokenAuthentication',
+    )
+}
+
+CORS_ORIGIN_WHITELIST = (
+    'osf.io',
+    'staging.osf.io'
+)
diff --git a/api/api/settings/local-dist.py b/api/api/settings/local-dist.py
new file mode 100644
index 00000000..934bd1bf
--- /dev/null
+++ b/api/api/settings/local-dist.py
@@ -0,0 +1,25 @@
+SECRET_KEY = 'My Secret Key'
+
+DEBUG = True
+
+DOMAIN = 'http://localhost:8000'
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.postgresql_psycopg2',
+        'NAME': 'scrapi',
+        # 'USER': 'name',
+        # 'PASSWORD': 'password',
+        # 'HOST': '127.0.0.1',
+        # 'PORT': '5432'
+    }
+}
+
+STATIC_URL = '/static/'
+
+
+CORS_ORIGIN_WHITELIST = (
+    'localhost:5000',
+    'osf.io',
+    'staging.osf.io'
+)
diff --git a/api/api/settings/local-travis.py b/api/api/settings/local-travis.py
new file mode 100644
index 00000000..84c47c0f
--- /dev/null
+++ b/api/api/settings/local-travis.py
@@ -0,0 +1,6 @@
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.postgresql_psycopg2',
+        'NAME': 'scrapi',
+    }
+}
diff --git a/api/api/static/css/api_custom.css b/api/api/static/css/api_custom.css
new file mode 100644
index 00000000..ac9fd29b
--- /dev/null
+++ b/api/api/static/css/api_custom.css
@@ -0,0 +1,64 @@
+/* custom navigation styles */
+.navbar {
+    width: 100%;
+    position: fixed;
+    left: 0;
+    top: 0;
+    z-index: 3;
+}
+
+.navbar {
+    background: #BBBBBB;
+    color: white;
+    border: none;
+    border-top: 5px solid #00B3EA;
+    border-radius: 0px;
+}
+
+.navbar .nav li, .navbar .nav li a,
+.navbar .brand:hover {
+    color: white;
+}
+
+.nav-list > .active > a, .nav-list > .active > a:hover {
+    background: #105B74;
+}
+
+.navbar .dropdown-menu li a, .navbar .dropdown-menu li {
+    color: #21B6E8;
+}
+
+.navbar .dropdown-menu li a:hover {
+    background: #EEEEEE;
+    color: #197D9E;
+}
+
+.pagination>.disabled>a,
+.pagination>.disabled>a:hover,
+.pagination>.disabled>a:focus {
+    cursor: not-allowed;
+    pointer-events: none;
+}
+
+.pager>.disabled>a,
+.pager>.disabled>a:hover,
+.pager>.disabled>a:focus {
+    pointer-events: none;
+}
+
+.pager .next {
+    margin-left: 10px;
+}
+
+.logo {
+    width: 60px;
+    padding-top: 5px;
+    padding-bottom: 5px;
+}
+
+.str, .atv {
+    color: #5D615B;
+}
+
+body a {
+    color: #20B3E4;
+}
diff --git a/api/api/static/img/share.png b/api/api/static/img/share.png
new file mode 100644
index 00000000..9201a663
Binary files /dev/null and b/api/api/static/img/share.png differ
diff --git a/api/api/urls.py b/api/api/urls.py
new file mode 100644
index 00000000..12b72f36
--- /dev/null
+++ b/api/api/urls.py
@@ -0,0 +1,22 @@
+"""api URL Configuration
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/1.8/topics/http/urls/
+Examples:
+Function views
+    1. Add an import:  from my_app import views
+    2. Add a URL to urlpatterns:  url(r'^$', views.home, name='home')
+Class-based views
+    1. Add an import:  from other_app.views import Home
+    2. Add a URL to urlpatterns:  url(r'^$', Home.as_view(), name='home')
+Including another URLconf
+    1. Add an import:  from blog import urls as blog_urls
+    2. Add a URL to urlpatterns:  url(r'^blog/', include(blog_urls))
+"""
+from django.conf.urls import include, url
+from django.contrib import admin
+
+urlpatterns = [
+    url(r'^admin/', include(admin.site.urls)),
+    url(r'^', include('api.webview.urls')),
+]
diff --git a/api/api/wsgi.py b/api/api/wsgi.py
new file mode 100644
index 00000000..f0ac3a7c
--- /dev/null
+++ b/api/api/wsgi.py
@@ -0,0 +1,16 @@
+"""
+WSGI config for api project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+
+application = get_wsgi_application()
diff --git a/api/templates/base.html b/api/templates/base.html
new file mode 100644
index 00000000..d7bd5c9f
--- /dev/null
+++ b/api/templates/base.html
@@ -0,0 +1,21 @@
+{% load staticfiles %}
+<!DOCTYPE html>
+<html>
+<head>
+    <link rel="stylesheet" type="text/css" href="{% static 'css/api_custom.css' %}"/>
+</head>
+<body>
+    <div class="navbar">
+        {% block header_content %}
+        {% endblock %}
+    </div>
+    <div>
+        Questions or feedback? Contact us at
+        <a href="mailto:share-support@osf.io">share-support@osf.io</a>
+    </div>
+    {% block content %}
+    {% endblock %}
+    {% block javascript_bottom %}
+    {% endblock %}
+</body>
+</html>
diff --git a/api/templates/rest_framework/api.html b/api/templates/rest_framework/api.html
new file mode 100644
index 00000000..ba7010bf
--- /dev/null
+++ b/api/templates/rest_framework/api.html
@@ -0,0 +1,25 @@
+{% extends "rest_framework/base.html" %}
+{% load url from future %}
+{% load rest_framework %}
+{% load static %}
+
+{% block title %}SHARE Notify{% endblock %}
+
+{% block style %}
+    {% block bootstrap_theme %}
+    {% endblock %}
+    <link rel="stylesheet" type="text/css" href="{% static 'css/api_custom.css' %}"/>
+{% endblock %}
+
+{% block branding %}
+    <a class="navbar-brand" href="/">
+        <img class="logo" src="{% static 'img/share.png' %}"/>
+    </a>
+{% endblock %}
+
+{% block userlinks %}
+    {{ block.super }}
+    {% if user.is_authenticated %}
+        <li><a href="#">Get API Key</a></li>
+    {% endif %}
+{% endblock %}
diff --git a/api/webview/__init__.py b/api/webview/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/api/webview/admin.py b/api/webview/admin.py
new file mode 100644
index 00000000..e69de29b
diff --git a/api/webview/models.py b/api/webview/models.py
new file mode 100644
index 00000000..65d58765
--- /dev/null
+++ b/api/webview/models.py
@@ -0,0 +1,13 @@
+from django.db import models
+
+from django_pgjson.fields import JsonField
+
+
+class Document(models.Model):
+    source = models.CharField(max_length=100)
+    docID = models.CharField(max_length=100)
+
+    providerUpdatedDateTime = models.DateTimeField(null=True)
+
+    raw = JsonField()
+    normalized = JsonField()
diff --git a/api/webview/serializers.py b/api/webview/serializers.py
new file mode 100644
index 00000000..decdb7cb
--- /dev/null
+++ b/api/webview/serializers.py
@@ -0,0 +1,10 @@
+from rest_framework import serializers
+
+from api.webview.models import Document
+
+
+class DocumentSerializer(serializers.ModelSerializer):
+
+    class Meta:
+        model = Document
+        fields = ('id', 'providerUpdatedDateTime', 'source', 'docID', 'raw', 'normalized')
diff --git a/api/webview/tests.py b/api/webview/tests.py
new file mode 100644
index 00000000..c581477c
--- /dev/null
+++ b/api/webview/tests.py
@@ -0,0 +1,29 @@
+import os
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+
+import pytest
+import django
+from django.test import TestCase
+from rest_framework.test import APIRequestFactory
+
+from api.webview.views import DocumentList
+
+django.setup()
+
+
+# TODO - make this work without Django.
+
+class APIViewTests(TestCase):
+
+    def setUp(self):
+        self.factory = APIRequestFactory()
+
+    @pytest.mark.postgres
+    def test_document_view(self):
+        view = DocumentList.as_view()
+        request = self.factory.get(
+            '/documents/'
+        )
+        response = view(request)
+
+        self.assertEqual(response.status_code, 200)
diff --git a/api/webview/urls.py b/api/webview/urls.py
new file mode 100644
index 00000000..e8bd7a19
--- /dev/null
+++ b/api/webview/urls.py
@@ -0,0 +1,8 @@
+from django.conf.urls import url
+from api.webview import views
+
+urlpatterns = [
+    url(r'^documents/$', views.DocumentList.as_view()),
+    url(r'^documents/(?P<source>\w+)/$', views.DocumentsFromSource.as_view(), name='source'),
+    url(r'^documents/(?P<source>[a-z]+)/(?P<docID>(.*))/$', views.document_detail, name='document_detail')
+]
diff --git a/api/webview/views.py b/api/webview/views.py
new file mode 100644
index 00000000..20b6f1a0
--- /dev/null
+++ b/api/webview/views.py
@@ -0,0 +1,58 @@
+from rest_framework import generics
+from rest_framework.response import Response
+from rest_framework.decorators import api_view
+from django.views.decorators.clickjacking import xframe_options_exempt
+
+from api.webview.models import Document
+from api.webview.serializers import DocumentSerializer
+
+
+class DocumentList(generics.ListCreateAPIView):
+    """
+    List all documents in the SHARE API
+    """
+    serializer_class = DocumentSerializer
+
+    def perform_create(self, serializer):
+        serializer.save(source=self.request.user)
+
+    def get_queryset(self):
+        """ Return all documents
+        """
+        queryset = Document.objects.all()
+
+        return queryset
+
+
+class DocumentsFromSource(generics.ListCreateAPIView):
+    """
+    List all documents from a particular source
+    """
+    serializer_class = DocumentSerializer
+
+    def perform_create(self, serializer):
+        serializer.save(source=self.request.user)
+
+    def get_queryset(self):
+        """ Return queryset based on source
+        """
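+        # 'source' is captured by the named group in api/webview/urls.py and
+        # handed to the view through self.kwargs.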
+        queryset = Document.objects.filter(source=self.kwargs['source'])
+
+        return queryset
+
+
+@api_view(['GET'])
+@xframe_options_exempt
+def document_detail(request, source, docID):
+    """
+    Retrieve one particular document.
+    """
+    try:
+        all_sources = Document.objects.filter(source=source)
+        document = all_sources.get(docID=docID)
+    except Document.DoesNotExist:
+        return Response(status=404)
+
+    if request.method == 'GET':
+        serializer = DocumentSerializer(document)
+        return Response(serializer.data)
diff --git a/dev-requirements.txt b/dev-requirements.txt
index 9a1410a5..b43600a3 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -4,6 +4,7 @@ httpretty==0.8.4
 pytest-cov==1.8.1
 ipdb==0.8
 ipython==3.1.0
+django-pytest==0.2.0
 
 pep8>=1.5.7,<1.6.0
 pyflakes>=0.8,<0.9
diff --git a/manage.py b/manage.py
new file mode 100755
index 00000000..b100cc2d
--- /dev/null
+++ b/manage.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+import os
+import sys
+
+if __name__ == "__main__":
+    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+
+    from django.core.management import execute_from_command_line
+
+    execute_from_command_line(sys.argv)
diff --git a/requirements.txt b/requirements.txt
index 170db5f1..90866344 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,5 +16,10 @@ furl==0.4.4
 jsonschema==2.4.0
 jsonpointer==1.7
 pycountry==1.10
+djangorestframework==3.1.3
+Django==1.8.2
+django-pgjson==0.3.1
+django-cors-headers==1.1.0
+psycopg2==2.6.1
 rfc3987==1.3.4
 strict-rfc3339==0.5
diff --git a/scrapi/base/__init__.py b/scrapi/base/__init__.py
index 11b5f9ab..d8610c85 100644
--- a/scrapi/base/__init__.py
+++ b/scrapi/base/__init__.py
@@ -81,7 +81,8 @@ class JSONHarvester(BaseHarvester, JSONTransformer):
     def normalize(self, raw_doc):
         transformed = self.transform(json.loads(raw_doc['doc']), fail=settings.RAISE_IN_TRANSFORMER)
         transformed['shareProperties'] = {
-            'source': self.short_name
+            'source': self.short_name,
+            'docID': raw_doc['docID']
         }
         return NormalizedDocument(transformed, clean=True)
 
@@ -92,7 +93,8 @@ class XMLHarvester(BaseHarvester, XMLTransformer):
     def normalize(self, raw_doc):
         transformed = self.transform(etree.XML(raw_doc['doc']), fail=settings.RAISE_IN_TRANSFORMER)
         transformed['shareProperties'] = {
-            'source': self.short_name
+            'source': self.short_name,
+            'docID': raw_doc['docID']
         }
         return NormalizedDocument(transformed, clean=True)
diff --git a/scrapi/events.py b/scrapi/events.py
index f6fc8eca..db943b83 100644
--- a/scrapi/events.py
+++ b/scrapi/events.py
@@ -22,6 +22,7 @@ HARVESTER_RUN = 'runHarvester'
 
 CHECK_ARCHIVE = 'checkArchive'
 NORMALIZATION = 'normalization'
+PROCESSING_URIS = 'processingUris'
 
 # statuses
 FAILED = 'failed'
diff --git a/scrapi/processing/__init__.py b/scrapi/processing/__init__.py
index b08de00c..7a1918dd 100644
--- a/scrapi/processing/__init__.py
+++ b/scrapi/processing/__init__.py
@@ -34,3 +34,9 @@ def process_raw(raw_doc, kwargs):
     for p in settings.RAW_PROCESSING:
         extras = kwargs.get(p, {})
         get_processor(p).process_raw(raw_doc, **extras)
+
+
+def process_uris(source, docID, uri, uritype, kwargs):
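+    # Hand each URI to every configured post-processor; POST_PROCESSING is
+    # the new setting added in scrapi/settings/local-dist.py (empty by default).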
+    for p in settings.POST_PROCESSING:
+        extras = kwargs.get(p, {})
+        get_processor(p).process_uris(source, docID, uri, uritype, **extras)
diff --git a/scrapi/processing/base.py b/scrapi/processing/base.py
index 155736e7..87bcb869 100644
--- a/scrapi/processing/base.py
+++ b/scrapi/processing/base.py
@@ -6,3 +6,6 @@
     def process_raw(self, raw_doc, **kwargs):
         pass  # pragma: no cover
 
     def process_normalized(self, raw_doc, normalized, **kwargs):
         pass  # pragma: no cover
+
+    def process_uris(self, source, docID, uri, uritype, **kwargs):
+        pass  # pragma: no cover
diff --git a/scrapi/processing/postgres.py b/scrapi/processing/postgres.py
new file mode 100644
index 00000000..d6c24e69
--- /dev/null
+++ b/scrapi/processing/postgres.py
@@ -0,0 +1,46 @@
+from __future__ import absolute_import
+
+import os
+import copy
+import logging
+
+from scrapi import events
+from scrapi.processing.base import BaseProcessor
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+from api.webview.models import Document
+
+logger = logging.getLogger(__name__)
+
+
+class PostgresProcessor(BaseProcessor):
+    NAME = 'postgres'
+
+    @events.logged(events.PROCESSING, 'raw.postgres')
+    def process_raw(self, raw_doc):
+        source, docID = raw_doc['source'], raw_doc['docID']
+        document = self._get_by_source_id(Document, source, docID) or Document(source=source, docID=docID)
+
+        modified_doc = copy.deepcopy(raw_doc.attributes)
+        if modified_doc.get('versions'):
+            modified_doc['versions'] = map(str, modified_doc['versions'])
+
+        document.raw = modified_doc
+
+        document.save()
+
+    @events.logged(events.PROCESSING, 'normalized.postgres')
+    def process_normalized(self, raw_doc, normalized):
+        source, docID = raw_doc['source'], raw_doc['docID']
+        document = self._get_by_source_id(Document, source, docID) or Document(source=source, docID=docID)
+
+        document.normalized = normalized.attributes
+        document.providerUpdatedDateTime = normalized['providerUpdatedDateTime']
+
+        document.save()
+
+    def _get_by_source_id(self, model, source, docID):
+        try:
+            return model.objects.filter(source=source, docID=docID)[0]
+        except IndexError:
+            return None
diff --git a/scrapi/processing/uri_logging.py b/scrapi/processing/uri_logging.py
new file mode 100644
index 00000000..bbc10aa9
--- /dev/null
+++ b/scrapi/processing/uri_logging.py
@@ -0,0 +1,59 @@
+from __future__ import absolute_import
+
+import os
+import datetime
+import requests
+import logging
+# from furl import furl
+
+# from scrapi.processing import scrapers
+from scrapi.processing.base import BaseProcessor
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+from api.webview.models import Document
+
+
+logger = logging.getLogger(__name__)
+
+
+class UriProcessor(BaseProcessor):
+    NAME = 'uri_logging'
+
+    def process_uris(self, source, docID, uri, uritype, **kwargs):
+        try:
+            document = Document.objects.get(source=source, docID=docID)
+            processed_normalized = self.save_status_of_uri(document.normalized, uri, uritype)
+
+            document.normalized = processed_normalized
+
+            document.save()
+        except TypeError:
+            pass
+
+    def save_status_of_uri(self, normalized, uri, uritype):
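+        # Resolve the URI once and record where it landed, when, and with
+        # what status code; repeated runs append to the document's uri_logs.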
+        uri_status = requests.get(uri)
+
+        status = {
+            'actual_uri': uri,
+            'uritype': uritype,
+            'resolved_uri': uri_status.url,
+            'resolved_datetime': datetime.datetime.now(),
+            'resolved_status': uri_status.status_code,
+            'is_doi': 'dx.doi.org' in normalized['uris']['canonicalUri']
+        }
+
+        try:
+            normalized['shareProperties']['uri_logs']['status'].append(status)
+        except KeyError:
+            normalized['shareProperties']['uri_logs'] = {}
+            normalized['shareProperties']['uri_logs']['status'] = [status]
+
+        # extra_info = scrapers.collect_scraped(uri)
+
+        # if extra_info:
+        #     try:
+        #         normalized['shareProperties']['scraped_properties'].append(extra_info)
+        #     except KeyError:
+        #         normalized['shareProperties']['scraped_properties'] = [extra_info]
+
+        return normalized
diff --git a/scrapi/scrapers/__init__.py b/scrapi/scrapers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scrapi/scrapers/scrapers.py b/scrapi/scrapers/scrapers.py
new file mode 100644
index 00000000..1fe307a1
--- /dev/null
+++ b/scrapi/scrapers/scrapers.py
@@ -0,0 +1,95 @@
+from furl import furl
+from bs4 import BeautifulSoup
+
+from scrapi.scrapers import utils
+
+
+def collect_scraped(uri):
+
+    base = furl(uri).host.replace('www.', '')
+    if base == 'sciencedirect.com':
+        info = science_direct(uri)
+    elif base == 'link.springer.com':
+        info = springer_link(uri)
+    else:
+        info = {}
+
+    return info
+
+
+# Science Direct
+
+
+def science_direct(uri):
+    '''
+    Future potential
+    '''
+
+    return parse_sd_author_list(uri)
+
+
+def parse_sd_author_list(uri):
+
+    soup = BeautifulSoup(utils.get_html_from_link(uri), "lxml")
+
+    auth_affil = soup.find_all("ul", class_="authorGroup")
+    lis = [item.find_all('li') for item in auth_affil][0]
+    theas = [thing.find_all('a') for thing in lis]
+
+    full_author_info = []
+    for author_pair in theas:
+        author_info = {}
+        for author_part in author_pair:
+            author_info = utils.merge_dicts(author_part.attrs, author_info)
+            author_info['matcher'] = author_info['href'][0]
+
+        full_author_info.append(author_info)
+
+    # Get affiliations and put them with the original author dict
+    outer_affiliations = soup.find_all("ul", class_="authAffil")
+    affiliations = [item.find_all('li') for item in outer_affiliations]
+
+    affils = []
+    for person in affiliations:
+        for result in person:
+            d = result.attrs
+            d['institution'] = result.text
+            d['matcher'] = d['id']
+            affils.append(d)
+
+    all_authors = []
+    for author in full_author_info:
+        del author['data-pos']
+        del author['data-t']
+        del author['data-tb']
+
+        for affil in affils:
+            if author['matcher'].replace('#', '') == affil['matcher']:
+                combined = author.copy()
+                combined.update(affil)
+                all_authors.append(combined)
+
+        del author['matcher']
+
+    return all_authors
+
+
+# Springer Link
+
+def springer_link(uri):
+    element = utils.get_elements_from_link(uri)
+    return {'open_access': get_springer_open_access(element)}
+
+
+def get_springer_open_access(element):
+    links = element.xpath('//a')
+    words = []
+    for link in links:
+        if 'viewtype' in link.keys():
+            if 'webtrekk-track' in (link.get('class') or ''):
+                words.append(link.get('viewtype'))
+    if 'Denial' in words:
+        return False
+    else:
+        return True
diff --git a/scrapi/scrapers/utils.py b/scrapi/scrapers/utils.py
new file mode 100644
index 00000000..f02c8699
--- /dev/null
+++ b/scrapi/scrapers/utils.py
@@ -0,0 +1,26 @@
+import requests
+from lxml import etree
+
+
+def merge_dicts(*dicts):
+    d = {}
+    for mapping in dicts:
+        for key in mapping:
+            try:
+                d[key].append(mapping[key])
+            except KeyError:
+                d[key] = [mapping[key]]
+    for key in d:
+        if len(d[key]) == 1:
+            d[key] = d[key][0]
+
+    return d
+
+
+def get_elements_from_link(link):
+    content = requests.get(link).content
+    return etree.HTML(content)
+
+
+def get_html_from_link(link):
+    return requests.get(link).content
diff --git a/scrapi/settings/local-dist.py b/scrapi/settings/local-dist.py
index 826d3528..5d6ada4a 100644
--- a/scrapi/settings/local-dist.py
+++ b/scrapi/settings/local-dist.py
@@ -12,6 +12,7 @@
 NORMALIZED_PROCESSING = []
 RAW_PROCESSING = []
+POST_PROCESSING = []
 
 SENTRY_DSN = None
diff --git a/scrapi/tasks.py b/scrapi/tasks.py
index c196d84e..948605e2 100644
--- a/scrapi/tasks.py
+++ b/scrapi/tasks.py
@@ -1,4 +1,5 @@
 import logging
+import json
 import functools
 from itertools import islice
 from datetime import date, timedelta
@@ -125,6 +126,49 @@ def process_normalized(normalized_doc, raw_doc, **kwargs):
     processing.process_normalized(raw_doc, normalized_doc, kwargs)
 
 
+@task_autoretry(default_retry_delay=settings.CELERY_RETRY_DELAY, max_retries=0)
+@events.logged(events.PROCESSING_URIS, 'uri_processing')
+def process_uris(async, **kwargs):
+    settings.CELERY_ALWAYS_EAGER = not async
+
+    all_buckets = []
+    if kwargs.get('source'):
+        source_buckets = util.parse_urls_into_groups(kwargs['source'])
+        all_buckets.append(source_buckets)
+    else:
+        for source in registry.keys():
+            source_buckets = util.parse_urls_into_groups(source)
+            all_buckets.append(source_buckets)
+
+    with open('all_sources.json', 'w') as outfile:
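+        # Dump the grouped URIs to disk for inspection; the commented-out
+        # loop below would fan them out to Celery instead.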
+        json.dump(all_buckets, outfile)
+
+    # for source_dict in all_buckets:
+    #     for group in source_dict['uris']:
+    #         process_uris_at_one_base_uri.delay(group['individual_uris'], async, kwargs=kwargs)
+
+
+@task_autoretry(default_retry_delay=settings.CELERY_RETRY_DELAY, max_retries=0)
+@events.logged(events.PROCESSING_URIS, 'uri_processing')
+def process_uris_at_one_base_uri(uri_list, async=False, **kwargs):
+    settings.CELERY_ALWAYS_EAGER = not async
+
+    for uri in uri_list:
+        process_one_uri.delay(uri, kwargs=kwargs)
+
+
+@task_autoretry(default_retry_delay=settings.CELERY_RETRY_DELAY, max_retries=0, rate_limit='5/s')
+@events.logged(events.PROCESSING_URIS, 'uri_processing')
+def process_one_uri(uri, **kwargs):
+    processing.process_uris(
+        source=uri['source'],
+        docID=uri['docID'],
+        uri=uri['uri'],
+        uritype=uri['uritype'],
+        kwargs=kwargs
+    )
+
+
 @app.task
 def migrate(migration, sources=tuple(), async=False, dry=True, group_size=1000, **kwargs):
     from scrapi.migrations import documents
diff --git a/scrapi/util.py b/scrapi/util.py
index 0116141d..86bf0b08 100644
--- a/scrapi/util.py
+++ b/scrapi/util.py
@@ -1,8 +1,15 @@
 from datetime import datetime
+import os
+import re
 
 import six
 import pytz
 
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+from api.webview.models import Document
+
+URL_RE = re.compile(r'(https?:\/\/[^\/]*)')
+
 
 def timestamp():
     return pytz.utc.localize(datetime.utcnow()).isoformat()
@@ -51,3 +58,63 @@ def json_without_bytes(jobj):
         if isinstance(v, six.binary_type):
             jobj[k] = v.decode('utf8')
     return jobj
+
+
+def parse_urls_into_groups(source):
+
+    source_dict = {'source': source, 'uris': [], 'all_bases': []}
+    for document in Document.objects.filter(source=source):
+        if document.normalized:
+            docID = document.normalized['shareProperties']['docID']
+
+            source_dict = uri_processing(
+                document.normalized['uris']['canonicalUri'],
+                source,
+                docID,
+                source_dict,
+                'canonicalUri'
+            )
+
+            if document.normalized['uris'].get('providerUris'):
+                for uri in document.normalized['uris']['providerUris']:
+                    source_dict = uri_processing(uri, source, docID, source_dict, 'providerUris')
+            if document.normalized['uris'].get('descriptorUris'):
+                for uri in document.normalized['uris']['descriptorUris']:
+                    source_dict = uri_processing(uri, source, docID, source_dict, 'descriptorUris')
+            if document.normalized['uris'].get('objectUris'):
+                for uri in document.normalized['uris']['objectUris']:
+                    if uri:
+                        if isinstance(uri, list):
+                            for element in uri:
+                                source_dict = uri_processing(element, source, docID, source_dict, 'objectUris')
+                        else:
+                            source_dict = uri_processing(uri, source, docID, source_dict, 'objectUris')
+
+    return source_dict
+
+
+def uri_processing(uri, source, docID, source_dict, uritype):
+    base_uri = URL_RE.search(uri).group()
+
+    if base_uri in source_dict['all_bases']:
+        for entry in source_dict['uris']:
+            if base_uri == entry['base_uri']:
+                entry['individual_uris'].append({
+                    'uri': uri,
+                    'source': source,
+                    'docID': docID,
+                    'uritype': uritype
+                })
+    else:
+        source_dict['uris'].append({
+            'base_uri': base_uri,
+            'individual_uris': [{
+                'uri': uri,
+                'source': source,
+                'docID': docID,
+                'uritype': uritype
+            }]
+        })
+        source_dict['all_bases'].append(base_uri)
+
+    return source_dict
diff --git a/tasks.py b/tasks.py
index e4adbc1a..3dba470e 100644
--- a/tasks.py
+++ b/tasks.py
@@ -1,3 +1,4 @@
+import os
 import base64
 import logging
 import platform
@@ -13,19 +14,20 @@
 from scrapi import registry
 from scrapi import settings
 
-from scrapi.processing.elasticsearch import es
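+# `es` is now imported inside the individual tasks that need it, so this
+# module can be loaded without a live Elasticsearch connection.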
 
 logger = logging.getLogger()
 
 
 @task
 def reindex(src, dest):
+    from scrapi.processing.elasticsearch import es
     helpers.reindex(es, src, dest)
     es.indices.delete(src)
 
 
 @task
 def alias(alias, index):
+    from scrapi.processing.elasticsearch import es
     es.indices.delete_alias(index=alias, name='_all', ignore=404)
     es.indices.put_alias(alias, index)
@@ -122,7 +124,7 @@ def test(cov=True, verbose=False, debug=False):
     if debug:
         cmd += ' -s'
     if cov:
-        cmd += ' --cov-report term-missing --cov-config .coveragerc --cov scrapi'
+        cmd += ' --cov-report term-missing --cov-config .coveragerc --cov scrapi --cov api'
 
     run(cmd, pty=True)
@@ -185,6 +187,14 @@ def harvesters(async=False, start=None, end=None):
             logger.exception(e)
 
 
+@task
+def process_uris(async=False, source=None):
+    settings.CELERY_ALWAYS_EAGER = not async
+    from scrapi.tasks import process_uris
+
+    process_uris.delay(async=async, source=source)
+
+
 @task
 def lint_all():
     for name in registry.keys():
@@ -224,3 +234,25 @@ def provider_map(delete=False):
         refresh=True
     )
     print(es.count('share_providers', body={'query': {'match_all': {}}})['count'])
+
+
+@task
+def apiserver():
+    os.system('python manage.py runserver')
+
+
+@task
+def reset_all():
+    try:
+        prompt = raw_input  # Python 2
+    except NameError:
+        prompt = input  # Python 3
+    if prompt('Are you sure? y/N ') != 'y':
+        return
+    os.system('psql -c "DROP DATABASE scrapi;"')
+    os.system('psql -c "CREATE DATABASE scrapi;"')
+    os.system('python manage.py migrate')
+
+    os.system("curl -XDELETE '{}/share*'".format(settings.ELASTIC_URI))
+    os.system("invoke alias share share_v2")
+    os.system("invoke provider_map")
diff --git a/tests/test_api_views.py b/tests/test_api_views.py
new file mode 100644
index 00000000..61b84d53
--- /dev/null
+++ b/tests/test_api_views.py
@@ -0,0 +1,43 @@
+import os
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+
+import django
+from django.test import TestCase
+from rest_framework.test import APIRequestFactory
+
+from api.webview.models import Document
+from api.webview.views import DocumentList, DocumentsFromSource, document_detail
+
+django.setup()
+
+
+class APIViewTests(TestCase):
+
+    def setUp(self):
+        self.factory = APIRequestFactory()
+
+    def test_document_view(self):
+        view = DocumentList.as_view()
+        request = self.factory.get(
+            '/documents/'
+        )
+        response = view(request)
+
+        self.assertEqual(response.status_code, 200)
+
+    def test_source_view(self):
+        view = DocumentsFromSource.as_view()
+        request = self.factory.get(
+            '/documents/dudley_weekly/'
+        )
+        response = view(request, source='dudley_weekly')
+
+        self.assertEqual(response.status_code, 200)
+
+    def test_individual_view(self):
+        Document(source='dudley_weekly', docID='dudley1', raw={}, normalized={}).save()
+        request = self.factory.get(
+            '/documents/dudley_weekly/dudley1/'
+        )
+        response = document_detail(request, source='dudley_weekly', docID='dudley1')
+
+        self.assertEqual(response.status_code, 200)
diff --git a/tests/test_json_harvester.py b/tests/test_json_harvester.py
index b54e4b19..8960f734 100644
--- a/tests/test_json_harvester.py
+++ b/tests/test_json_harvester.py
@@ -33,7 +33,8 @@
         },
         "providerUpdatedDateTime": "2015-02-02T00:00:00+00:00",
         "shareProperties": {
-            "source": "test"
+            "source": "test",
+            "docID": "1"
         },
         "otherProperties": [
             {
diff --git a/tests/test_migrations.py b/tests/test_migrations.py
index b7fecbd1..5807f2d7 100644
--- a/tests/test_migrations.py
+++ b/tests/test_migrations.py
@@ -12,14 +12,15 @@
 from scrapi.migrations import rename
 from scrapi.migrations import renormalize
 from scrapi.migrations import DocumentModelOld
-from scrapi.migrations import document_v2_migration
 
 # Need to force cassandra to ignore set keyspace
 from scrapi.processing.cassandra import CassandraProcessor, DocumentModel
+from scrapi.processing.postgres import PostgresProcessor
 
 from . import utils
 
 test_cass = CassandraProcessor()
+test_postgres = PostgresProcessor()
 
 test_harvester = utils.TestHarvester()
diff --git a/tests/test_postgres_processor.py b/tests/test_postgres_processor.py
new file mode 100644
index 00000000..841db7ed
--- /dev/null
+++ b/tests/test_postgres_processor.py
@@ -0,0 +1,31 @@
+# import pytest
+
+import os
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.api.settings")
+
+import django
+from django.test import TestCase
+from scrapi.processing.postgres import PostgresProcessor, Document
+
+from . import utils
+from scrapi.linter.document import RawDocument, NormalizedDocument
+
+django.setup()
+
+test_db = PostgresProcessor()
+
+RAW = RawDocument(utils.POSTGRES_RAW_DOC)
+NORMALIZED = NormalizedDocument(utils.RECORD)
+
+
+class DocumentTestCase(TestCase):
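+    # Each test pushes a document through the processor, then queries it
+    # back out of Postgres to check that it was actually persisted.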
+
+    def test_raw_processing(self):
+        test_db.process_raw(RAW)
+        document = Document.objects.get(source=RAW['source'], docID=RAW['docID'])
+        assert document.docID == RAW.attributes['docID']
+
+    def test_normalized_processing(self):
+        test_db.process_normalized(RAW, NORMALIZED)
+        document = Document.objects.get(source=RAW['source'], docID=RAW['docID'])
+        assert document.source == NORMALIZED['shareProperties']['source']
diff --git a/tests/utils.py b/tests/utils.py
index 770812c6..a9d13d0b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -21,6 +21,18 @@
     'source': 'test'
 }
 
+POSTGRES_RAW_DOC = {
+    'doc': '{}',
+    'docID': 'someID',
+    'timestamps': {
+        'harvestFinished': '2012-11-30T17:05:48+00:00',
+        'harvestStarted': '2012-11-30T17:05:48+00:00',
+        'harvestTaskCreated': '2012-11-30T17:05:48+00:00'
+    },
+    'filetype': 'json',
+    'source': 'test'
+}
+
 NORMALIZED_DOC = {
     'title': 'No',
     'contributors': [{'name': ''}],