diff --git a/INSTALL b/INSTALL index f1ad87d6ea..e584746094 100644 --- a/INSTALL +++ b/INSTALL @@ -83,6 +83,9 @@ Contents natively in UTF-8 mode by setting "default-character-set=utf8" in various parts of your "my.cnf" file, such as in the "[mysql]" part and elsewhere; but this is not really required. + Note also that you may encounter problems when MySQL is run in + "strict mode"; you may want to configure your "my.cnf" in order + to avoid using strict mode (such as `STRICT_ALL_TABLES`). c) Redis server (may be on a remote machine) for user session diff --git a/modules/bibdocfile/lib/bibdocfile.py b/modules/bibdocfile/lib/bibdocfile.py index 12598b36ac..0cada52da5 100644 --- a/modules/bibdocfile/lib/bibdocfile.py +++ b/modules/bibdocfile/lib/bibdocfile.py @@ -59,6 +59,7 @@ import cgi import sys import copy +import tarfile if sys.hexversion < 0x2060000: from md5 import md5 @@ -102,6 +103,7 @@ encode_for_xml from invenio.urlutils import create_url, make_user_agent_string from invenio.textutils import nice_size +from invenio.webuser import collect_user_info from invenio.access_control_engine import acc_authorize_action from invenio.access_control_admin import acc_is_user_in_role, acc_get_role_id from invenio.access_control_firerole import compile_role_definition, acc_firerole_check_user @@ -123,7 +125,7 @@ CFG_BIBCATALOG_SYSTEM from invenio.bibcatalog import BIBCATALOG_SYSTEM from invenio.bibdocfile_config import CFG_BIBDOCFILE_ICON_SUBFORMAT_RE, \ - CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT + CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT, CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS from invenio.pluginutils import PluginContainer import invenio.template @@ -797,17 +799,22 @@ def get_xml_8564(self): return out - def get_total_size_latest_version(self): + def get_total_size_latest_version(self, user_info=None, subformat=None): """ Returns the total size used on disk by all the files belonging to this record and corresponding to the latest version. 
+ @param user_info: the user_info dictionary, used to check restrictions + @type: dict + @param subformat: if subformat is specified, it limits files + only to those from that specific subformat + @type subformat: string @return: the total size. @rtype: integer """ size = 0 for (bibdoc, _) in self.bibdocs.values(): - size += bibdoc.get_total_size_latest_version() + size += bibdoc.get_total_size_latest_version(user_info, subformat) return size def get_total_size(self): @@ -1564,6 +1571,36 @@ def get_text(self, extract_text_if_necessary=True): return " ".join(texts) + def stream_archive_of_latest_files(self, req, files_size=''): + """ + Streams the tar archive with all files of a certain file size (that + are not restricted or hidden) to the user. + File size should be a string that can be compared with the output of + BibDocFile.get_subformat() function. + + @param req: Apache Request Object + @type req: Apache Request Object + @param files_size: size of the files (they can be defined in + bibdocfile_config). Empty string means the original size. 
+ @type files_size: string + """ + # Get the internal size from the user-friendly file size name + internal_format = [f[1] for f in CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS if f[0] == files_size] + if len(internal_format) < 1: + # Incorrect file size + return + internal_format = internal_format[0] + tarname = str(self.id) + "_" + files_size + '.tar' + + # Select files that user can download (not hidden nor restricted) + user_info = collect_user_info(req) + req.content_type = "application/x-tar" + req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % tarname + tar = tarfile.open(fileobj=req, mode='w|') + for f in self.list_latest_files(): + if f.get_subformat() == internal_format and f.is_restricted(user_info)[0] == 0 and not f.hidden: + tar.add(f.get_path(), arcname=f.get_full_name(), recursive=False) + tar.close() class BibDoc(object): """ @@ -2801,12 +2838,28 @@ def _build_related_file_list(self): cur_doc = BibDoc.create_instance(docid=docid, human_readable=self.human_readable) self.related_files[doctype].append(cur_doc) - def get_total_size_latest_version(self): + def get_total_size_latest_version(self, user_info=None, subformat=None): """Return the total size used on disk of all the files belonging - to this bibdoc and corresponding to the latest version.""" + to this bibdoc and corresponding to the latest version. Restricted + and hidden files are not counted, unless there is no user_info. 
+ @param user_info: the user_info dictionary, used to check restrictions + @type: dict + @param subformat: if subformat is specified, it limits files + only to those from that specific subformat + @type subformat: string + """ ret = 0 + all_files = False + # If we are calling this function without user_info, then we want to + # see all the files + if not user_info: + all_files = True for bibdocfile in self.list_latest_files(): - ret += bibdocfile.get_size() + # First check for restrictions + if all_files or (bibdocfile.is_restricted(user_info)[0] == 0 and not bibdocfile.hidden): + # Then check if the format is correct + if subformat is None or bibdocfile.get_subformat() == subformat: + ret += bibdocfile.get_size() return ret def get_total_size(self): diff --git a/modules/bibdocfile/lib/bibdocfile_config.py b/modules/bibdocfile/lib/bibdocfile_config.py index 0caa8ddbd2..c07eb606ff 100644 --- a/modules/bibdocfile/lib/bibdocfile_config.py +++ b/modules/bibdocfile/lib/bibdocfile_config.py @@ -69,3 +69,12 @@ # CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT -- this is the default subformat used # when creating new icons. CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT = "icon" + +# CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS -- a list (not dictionary, because +# we want to preserve the order) that connects the different format sizes +# (like 'small', 'medium', etc.) with internal format sizes (like 'icon-180', 'icon-640', etc.) 
+CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS = [ + ('small', 'icon-180'), + ('medium', 'icon-640'), + ('large', 'icon-1440'), + ('original', '')] diff --git a/modules/bibdocfile/lib/bibdocfile_webinterface.py b/modules/bibdocfile/lib/bibdocfile_webinterface.py index 1547dd3ddf..198b72fb17 100644 --- a/modules/bibdocfile/lib/bibdocfile_webinterface.py +++ b/modules/bibdocfile/lib/bibdocfile_webinterface.py @@ -83,6 +83,12 @@ def _lookup(self, component, path): def getfile(req, form): args = wash_urlargd(form, bibdocfile_templates.files_default_urlargd) ln = args['ln'] + if filename[:9] == "allfiles-": + files_size = filename[9:] + # stream a tar package to the user + brd = BibRecDocs(self.recid) + brd.stream_archive_of_latest_files(req, files_size) + return _ = gettext_set_language(ln) diff --git a/modules/bibformat/lib/bibreformat.py b/modules/bibformat/lib/bibreformat.py index 1571146adb..9c712dcbd3 100644 --- a/modules/bibformat/lib/bibreformat.py +++ b/modules/bibformat/lib/bibreformat.py @@ -353,7 +353,11 @@ def task_run_core(): if task_has_option("last"): recids += outdated_caches(fmt, last_updated) - if task_has_option('ignore_without'): + if task_has_option('ignore_without') or \ + task_has_option('collection') or \ + task_has_option('field') or \ + task_has_option('pattern') or \ + task_has_option('recids'): without_fmt = intbitset() else: without_fmt = missing_caches(fmt) diff --git a/modules/bibsched/lib/tasklets/bst_inspirehep_names_mapper.py b/modules/bibsched/lib/tasklets/bst_inspirehep_names_mapper.py new file mode 100644 index 0000000000..2203b412eb --- /dev/null +++ b/modules/bibsched/lib/tasklets/bst_inspirehep_names_mapper.py @@ -0,0 +1,130 @@ +# This file is part of Invenio. +# Copyright (C) 2016 CERN. +# +# Invenio is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. 
+# +# Invenio is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Invenio; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +"""INSPIRE-HEP IDs to names mapping (Invenio Bibliographic Tasklet). + +Harvest records on INSPIRE-HEP including MARC fields 035__9 and 035__a. Map +INSPIRE-IDs to INSPIRE-HEP-names, and write the dictionary to a JSON file. + +Output example: {"INSPIRE-12345": "john.1", ...} + +Usage: +$bibtasklet -N inspirehep-names-mapper + -T bst_inspirehep_names_mapper [-a json_file + [default: invenio.config.CFG_CACHEDIR/inspirehep-names-mapping.json]] +""" + +import time +import urllib2 +import xml.etree.ElementTree as ET +from os.path import join +from sys import stderr + +from invenio.bibauthority_people_utils import ( + export_json, UtilsError, version_file) +from invenio.bibtask import write_message +from invenio.config import CFG_CACHEDIR + +INSPIREHEP_NAMES_MAPPING_FILE = join( + CFG_CACHEDIR, "inspirehep-names-mapping.json") +ns = {"x": "http://www.loc.gov/MARC21/slim"} # XML namespaces + + +def get_records(record_limit=250): + """Get MARCXML record elements. + + :param int record_limit: records limit each request. 
Maximum 251, + except if you are a superadmin + :return: list of MARCXML record elements or empty list + """ + counter = 1 + records_all = [] + + url = ( + "https://inspirehep.net/search?cc=HepNames" + "&p=035__9%3ABAI+035__%3AINSPIRE&of=xm&ot=035&rg={0}&jrec={1}") + + while 1: + req = urllib2.Request(url.format(record_limit, counter)) + try: + response = urllib2.urlopen(req) + except urllib2.URLError as e: + raise e + page_result = response.read() + root = ET.fromstring(page_result) + records = root.findall(".//x:record", namespaces=ns) or [] + + if not records: + break + + records_all = records_all + records + counter += record_limit + + # Sleep some seconds between every request not to be banned + time.sleep(3) + + return records_all + + +def get_mapping(inspire_records): + """Get mapping INSPIRE-ID to INSPIRE-HEP-name. + + :param list inspire_records: list of MARCXML record elements + :return: dictionary containing the mapping + """ + inspire_mapping = {} + + for record in inspire_records: + inspire_id = inspire_name = None + + datafields = record.findall("x:datafield[@tag='035']", namespaces=ns) + for datafield in datafields: + subfield = datafield.find("x:subfield[@code='9']", namespaces=ns) + if subfield is not None: + subfield_a = datafield.find( + "x:subfield[@code='a']", namespaces=ns) + if subfield_a is not None: + if (subfield.text == "INSPIRE"): + inspire_id = subfield_a.text + elif (subfield.text == "BAI"): + inspire_name = subfield_a.text + + inspire_mapping[inspire_id] = inspire_name + + return inspire_mapping + + +def bst_inspirehep_names_mapper(json_file=INSPIREHEP_NAMES_MAPPING_FILE): + """Map INSPIRE-IDs to INSPIRE-HEP-names and write to JSON. 
+ + :param filepath json_file: path to JSON file containing the INSPIRE mapping + """ + try: + records = get_records() + write_message("{0} records (HepNames) fetched.".format(len(records))) + mapping = get_mapping(records) + if mapping: + version_file(json_file, 1) + try: + export_json(mapping, json_file) + write_message( + "Mapping for INSPIRE-HEP-ids and -names exported to JSON. " + "See '{0}'.".format(json_file)) + except UtilsError as e: + write_message(e.reason, stderr) + except urllib2.URLError as e: + write_message(e.reason, stderr) diff --git a/modules/webaccess/doc/hacking/webaccess-api.webdoc b/modules/webaccess/doc/hacking/webaccess-api.webdoc index f657c7ad42..dbec7eed81 100644 --- a/modules/webaccess/doc/hacking/webaccess-api.webdoc +++ b/modules/webaccess/doc/hacking/webaccess-api.webdoc @@ -23,7 +23,7 @@
 Invenio Access Control Engine can be called from within your Python programs
 via both a regular Python API and CLI.
-In addition the you get an explanation of the program flow.
+In addition to the above features, you also get an explanation of the program flow.
 
 Contents:
  1. Regular API
diff --git a/modules/websearch/lib/search_engine.py b/modules/websearch/lib/search_engine.py
index ab15175be8..eb78a0cc09 100644
--- a/modules/websearch/lib/search_engine.py
+++ b/modules/websearch/lib/search_engine.py
@@ -97,7 +97,8 @@
      InvenioWebSearchReferstoLimitError, \
      InvenioWebSearchCitedbyLimitError, \
      CFG_WEBSEARCH_IDXPAIRS_FIELDS,\
-     CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH
+     CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH, \
+     CFG_WEBSEARCH_BLACKLISTED_FORMATS
 from invenio.search_engine_utils import (get_fieldvalues,
                                          get_fieldvalues_alephseq_like,
                                          record_exists)
@@ -1711,7 +1712,7 @@ def get_synonym_terms(term, kbr_name, match_type, use_memoise=False):
     return dterms.keys()
 
 
-def wash_output_format(ouput_format):
+def wash_output_format(ouput_format, verbose=False, req=None):
     """Wash output format FORMAT.  Currently only prevents input like
     'of=9' for backwards-compatible format that prints certain fields
     only.  (for this task, 'of=tm' is preferred)"""
@@ -1719,6 +1720,12 @@ def wash_output_format(ouput_format):
         # asked to print MARC tags, but not enough digits,
         # so let's switch back to HTML brief default
         return 'hb'
+    elif ouput_format in CFG_WEBSEARCH_BLACKLISTED_FORMATS:
+        if verbose:
+            write_warning("Selected format is not available through perform_request_search", req=req)
+        # Fall back to the HTML brief default instead of returning nothing:
+        # an empty result would not reveal that the format was refused.
+        return 'hb'
     else:
         return ouput_format
 
@@ -5714,7 +5721,7 @@ def prs_wash_arguments(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=CFG_WE
     """
 
     # wash output format:
-    of = wash_output_format(of)
+    of = wash_output_format(of, verbose=verbose, req=req)
 
     # wash all arguments requiring special care
     p = wash_pattern(p)
diff --git a/modules/websearch/lib/search_engine_config.py b/modules/websearch/lib/search_engine_config.py
index 17d8daf1da..2bc848a427 100644
--- a/modules/websearch/lib/search_engine_config.py
+++ b/modules/websearch/lib/search_engine_config.py
@@ -50,6 +50,11 @@
 # interfaces (0=simple, 1=advanced, 2=add-to-search):
 CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES = [0,1,2]
 
+# CFG_WEBSEARCH_BLACKLISTED_FORMATS -- list of output formats that will be
+# refused by perform_request_search:
+# * recstruct is an internal format and thus should not be exposed
+CFG_WEBSEARCH_BLACKLISTED_FORMATS = ["recstruct", "wapaff", "wapdat"]
+
 
 class InvenioWebSearchUnknownCollectionError(Exception):
     """Exception for bad collection."""
diff --git a/modules/websearch/lib/websearch_external_collections_getter_unit_tests.py b/modules/websearch/lib/websearch_external_collections_getter_unit_tests.py
index 8d55a05730..5ac036d9b7 100644
--- a/modules/websearch/lib/websearch_external_collections_getter_unit_tests.py
+++ b/modules/websearch/lib/websearch_external_collections_getter_unit_tests.py
@@ -51,7 +51,7 @@ def test_async_download(self):
         ##   - test 1 bad IP: 1.2.3.4
         ## Return the list of errors.
         checks = [
-            {'url': 'http://invenio-software.org', 'content': 'About Invenio'},
+            {'url': 'http://invenio-software.org', 'content': 'Invenio'},
             {'url': 'http://rjfreijoiregjreoijgoirg.fr'},
             {'url': 'http://1.2.3.4/'}]
 
diff --git a/modules/websubmit/lib/functions/Stamp_Uploaded_Files.py b/modules/websubmit/lib/functions/Stamp_Uploaded_Files.py
index e8b5b42147..40c9cee7e6 100644
--- a/modules/websubmit/lib/functions/Stamp_Uploaded_Files.py
+++ b/modules/websubmit/lib/functions/Stamp_Uploaded_Files.py
@@ -302,8 +302,8 @@ def visit_for_stamping(visit_for_stamping_arguments, dirname, filenames):
         'file_stamper_options' members.
        @param dirname: (string) - the path to the directory in which the
         files are to be stamped.
-       @param filenames: (list) - the names of each file in dirname. An
-        attempt will be made to stamp each of these files.
+       @param filenames: (list) - the names of each file and subdirectory in
+        dirname. An attempt is made to stamp each regular file; others are skipped.
        @Exceptions Raised:
          + InvenioWebSubmitFunctionWarning;
          + InvenioWebSubmitFunctionError;
@@ -345,6 +345,10 @@ def visit_for_stamping(visit_for_stamping_arguments, dirname, filenames):
         path_to_subject_file = "%s/%s" % (dirname, file_to_stamp)
         file_stamper_options['input-file'] = path_to_subject_file
 
+        if not os.path.isfile(path_to_subject_file):
+            # If it's not a file, we can't stamp it. Continue with next file
+            continue
+
         ## Just before attempting to stamp the file, log the dictionary of
         ## options (file_stamper_options) that will be passed to websubmit-
         ## file-stamper: