CERNDocumentServer · jochenklein · Sep 9, 2015 · Sep 9, 2015 · Mar 8, 2014 · Feb 23, 2015
diff --git a/INSTALL b/INSTALL
@@ -83,6 +83,9 @@ Contents
         natively in UTF-8 mode by setting "default-character-set=utf8"
         in various parts of your "my.cnf" file, such as in the
         "[mysql]" part and elsewhere; but this is not really required.
+        Note also that you may encounter problems when MySQL is run in
+        "strict mode"; you may want to configure your "my.cnf" in order
+        to avoid using strict mode (such as `STRICT_ALL_TABLES`).
         <http://mysql.com/>
 
      c) Redis server (may be on a remote machine) for user session

diff --git a/modules/bibdocfile/lib/bibdocfile.py b/modules/bibdocfile/lib/bibdocfile.py
@@ -59,6 +59,7 @@
 import cgi
 import sys
 import copy
+import tarfile
 
 if sys.hexversion < 0x2060000:
     from md5 import md5
@@ -102,6 +103,7 @@
     encode_for_xml
 from invenio.urlutils import create_url, make_user_agent_string
 from invenio.textutils import nice_size
+from invenio.webuser import collect_user_info
 from invenio.access_control_engine import acc_authorize_action
 from invenio.access_control_admin import acc_is_user_in_role, acc_get_role_id
 from invenio.access_control_firerole import compile_role_definition, acc_firerole_check_user
@@ -123,7 +125,7 @@
     CFG_BIBCATALOG_SYSTEM
 from invenio.bibcatalog import BIBCATALOG_SYSTEM
 from invenio.bibdocfile_config import CFG_BIBDOCFILE_ICON_SUBFORMAT_RE, \
-    CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT
+    CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT, CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS
 from invenio.pluginutils import PluginContainer
 
 import invenio.template
@@ -797,17 +799,22 @@ def get_xml_8564(self):
 
         return out
 
-    def get_total_size_latest_version(self):
+    def get_total_size_latest_version(self, user_info=None, subformat=None):
         """
         Returns the total size used on disk by all the files belonging
         to this record and corresponding to the latest version.
 
+        @param user_info: the user_info dictionary, used to check restrictions
+        @type user_info: dict
+        @param subformat: if subformat is specified, it limits files
+            only to those from that specific subformat
+        @type subformat: string
         @return: the total size.
         @rtype: integer
         """
         size = 0
         for (bibdoc, _) in self.bibdocs.values():
-            size += bibdoc.get_total_size_latest_version()
+            size += bibdoc.get_total_size_latest_version(user_info, subformat)
         return size
 
     def get_total_size(self):
@@ -1564,6 +1571,36 @@ def get_text(self, extract_text_if_necessary=True):
 
         return " ".join(texts)
 
+    def stream_archive_of_latest_files(self, req, files_size=''):
+        """
+        Streams a tar archive with the latest files of the given size that
+        are neither restricted for the user nor hidden.
+        files_size is matched against the user-friendly names (first items)
+        of CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS; no match streams nothing.
+
+        @param req: Apache Request Object
+        @type req: Apache Request Object
+        @param files_size: user-friendly size name such as 'original'. The
+        default '' is not one of the names in bibdocfile_config.
+        @type files_size: string
+        """
+        # Get the internal size from the user-friendly file size name
+        internal_format = [f[1] for f in CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS if f[0] == files_size]
+        if len(internal_format) < 1:
+            # Unknown size name: return before any header is set
+            return
+        internal_format = internal_format[0]
+        tarname = str(self.id) + "_" + files_size + '.tar'
+
+        # Select files that user can download (not hidden nor restricted)
+        user_info = collect_user_info(req)
+        req.content_type = "application/x-tar"
+        req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % tarname
+        tar = tarfile.open(fileobj=req, mode='w|')
+        for f in self.list_latest_files():
+            if f.get_subformat() == internal_format and f.is_restricted(user_info)[0] == 0 and not f.hidden:
+                tar.add(f.get_path(), arcname=f.get_full_name(), recursive=False)
+        tar.close()
 
 class BibDoc(object):
     """
@@ -2801,12 +2838,28 @@ def _build_related_file_list(self):
                 cur_doc = BibDoc.create_instance(docid=docid, human_readable=self.human_readable)
                 self.related_files[doctype].append(cur_doc)
 
-    def get_total_size_latest_version(self):
+    def get_total_size_latest_version(self, user_info=None, subformat=None):
         """Return the total size used on disk of all the files belonging
-        to this bibdoc and corresponding to the latest version."""
+        to this bibdoc and corresponding to the latest version. Restricted
+        and hidden files are not counted, unless there is no user_info.
+        @param user_info: the user_info dictionary, used to check restrictions
+        @type user_info: dict
+        @param subformat: if subformat is specified, it limits files
+            only to those from that specific subformat
+        @type subformat: string
+        """
         ret = 0
+        all_files = False
+        # Without user_info (None or empty) the restriction checks below are
+        # skipped and all the files are counted
+        if not user_info:
+            all_files = True
         for bibdocfile in self.list_latest_files():
-            ret += bibdocfile.get_size()
+            # First check for restrictions
+            if all_files or (bibdocfile.is_restricted(user_info)[0] == 0 and not bibdocfile.hidden):
+                # Then check if the format is correct
+                if subformat is None or bibdocfile.get_subformat() == subformat:
+                    ret += bibdocfile.get_size()
         return ret
 
     def get_total_size(self):

diff --git a/modules/bibdocfile/lib/bibdocfile_config.py b/modules/bibdocfile/lib/bibdocfile_config.py
@@ -69,3 +69,12 @@
 # CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT -- this is the default subformat used
 # when creating new icons.
 CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT = "icon"
+
+# CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS -- a list (not dictionary, because
+# we want to preserve the order) that connects the different format sizes
+# (like 'small', 'medium', etc.) with internal format sizes (like 'icon-180', 'icon-640', etc.)
+CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS = [
+    ('small', 'icon-180'),
+    ('medium', 'icon-640'),
+    ('large', 'icon-1440'),
+    ('original', '')]
diff --git a/modules/bibdocfile/lib/bibdocfile_webinterface.py b/modules/bibdocfile/lib/bibdocfile_webinterface.py
@@ -83,6 +83,12 @@ def _lookup(self, component, path):
         def getfile(req, form):
             args = wash_urlargd(form, bibdocfile_templates.files_default_urlargd)
             ln = args['ln']
+            if filename[:9] == "allfiles-":
+                files_size = filename[9:]
+                # stream a tar package to the user
+                brd = BibRecDocs(self.recid)
+                brd.stream_archive_of_latest_files(req, files_size)
+                return
 
             _ = gettext_set_language(ln)
 

diff --git a/modules/bibformat/lib/bibreformat.py b/modules/bibformat/lib/bibreformat.py
@@ -353,7 +353,11 @@ def task_run_core():
         if task_has_option("last"):
             recids += outdated_caches(fmt, last_updated)
 
-        if task_has_option('ignore_without'):
+        if task_has_option('ignore_without') or \
+                task_has_option('collection') or \
+                task_has_option('field') or \
+                task_has_option('pattern') or \
+                task_has_option('recids'):
             without_fmt = intbitset()
         else:
             without_fmt = missing_caches(fmt)

diff --git a/modules/bibsched/lib/tasklets/bst_inspirehep_names_mapper.py b/modules/bibsched/lib/tasklets/bst_inspirehep_names_mapper.py
@@ -0,0 +1,130 @@
+# This file is part of Invenio.
+# Copyright (C) 2016 CERN.
+#
+# Invenio is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# Invenio is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Invenio; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+"""INSPIRE-HEP IDs to names mapping (Invenio Bibliographic Tasklet).
+
+Harvest records on INSPIRE-HEP including MARC fields 035__9 and 035__a. Map
+INSPIRE-IDs to INSPIRE-HEP-names, and write the dictionary to a JSON file.
+
+Output example: {"INSPIRE-12345": "john.1", ...}
+
+Usage:
+$bibtasklet -N inspirehep-names-mapper
+    -T bst_inspirehep_names_mapper [-a json_file
+        [default: invenio.config.CFG_CACHEDIR/inspirehep-names-mapping.json]]
+"""
+
+import time
+import urllib2
+import xml.etree.ElementTree as ET
+from os.path import join
+from sys import stderr
+
+from invenio.bibauthority_people_utils import (
+    export_json, UtilsError, version_file)
+from invenio.bibtask import write_message
+from invenio.config import CFG_CACHEDIR
+
+INSPIREHEP_NAMES_MAPPING_FILE = join(
+    CFG_CACHEDIR, "inspirehep-names-mapping.json")
+ns = {"x": "http://www.loc.gov/MARC21/slim"}  # XML namespaces
+
+
+def get_records(record_limit=250):
+    """Get MARCXML record elements.
+
+    Requests the HepNames search results page by page and stops at the
+    first page that contains no record.
+
+    :param int record_limit: records limit each request. Maximum 251,
+        except if you are a superadmin
+    :return: list of MARCXML record elements or empty list
+    :raises ValueError: if `record_limit` is smaller than 1
+    :raises urllib2.URLError: if a request fails (not handled here)
+    """
+    if record_limit < 1:
+        # The offset must advance, or the same page may be fetched forever
+        raise ValueError("record_limit must be a positive integer")
+
+    url = (
+        "https://inspirehep.net/search?cc=HepNames"
+        "&p=035__9%3ABAI+035__%3AINSPIRE&of=xm&ot=035&rg={0}&jrec={1}")
+    records_all = []
+    counter = 1
+    while True:
+        response = urllib2.urlopen(url.format(record_limit, counter))
+        try:
+            root = ET.fromstring(response.read())
+        finally:
+            response.close()
+        records = root.findall(".//x:record", namespaces=ns)
+        if not records:
+            return records_all
+        records_all.extend(records)
+        counter += record_limit
+        # Sleep some seconds between every request not to be banned
+        time.sleep(3)
+
+
+def get_mapping(inspire_records):
+    """Get mapping INSPIRE-ID to INSPIRE-HEP-name.
+
+    For each record, the 035 datafield whose subfield 9 is "INSPIRE" gives
+    the ID; the one whose subfield 9 is "BAI" gives the name (subfield a).
+    Records without an INSPIRE-ID are skipped.
+
+    :param list inspire_records: list of MARCXML record elements
+    :return: dictionary containing the mapping
+    """
+    inspire_mapping = {}
+    for record in inspire_records:
+        inspire_id = inspire_name = None
+        for datafield in record.findall(
+                "x:datafield[@tag='035']", namespaces=ns):
+            source = datafield.find("x:subfield[@code='9']", namespaces=ns)
+            value = datafield.find("x:subfield[@code='a']", namespaces=ns)
+            if source is None or value is None:
+                continue
+            if source.text == "INSPIRE":
+                inspire_id = value.text
+            elif source.text == "BAI":
+                inspire_name = value.text
+        if inspire_id is not None:  # else every such record shares key None
+            inspire_mapping[inspire_id] = inspire_name
+    return inspire_mapping
+
+
+def bst_inspirehep_names_mapper(json_file=INSPIREHEP_NAMES_MAPPING_FILE):
+    """Fetch HepNames records, map INSPIRE-IDs to names, export as JSON.
+
+    :param filepath json_file: path to JSON file containing the INSPIRE mapping
+    """
+    try:
+        harvested = get_records()
+        write_message("{0} records (HepNames) fetched.".format(len(harvested)))
+        id_to_name = get_mapping(harvested)
+        if id_to_name:  # an empty mapping leaves the existing file untouched
+            version_file(json_file, 1)
+            try:
+                export_json(id_to_name, json_file)
+                write_message(
+                    "Mapping for INSPIRE-HEP-ids and -names exported to JSON. "
+                    "See '{0}'.".format(json_file))
+            except UtilsError as export_error:
+                write_message(export_error.reason, stderr)
+    except urllib2.URLError as url_error:
+        write_message(url_error.reason, stderr)
diff --git a/modules/webaccess/doc/hacking/webaccess-api.webdoc b/modules/webaccess/doc/hacking/webaccess-api.webdoc
@@ -23,7 +23,7 @@
 <pre>
 Invenio Access Control Engine can be called from within your Python programs
 via both a regular Python API and CLI.
-In addition the you get an explanation of the program flow.
+In addition to the above features, you also get an explanation of the program flow.
 
 Contents:
  1. Regular API

diff --git a/modules/websearch/lib/search_engine.py b/modules/websearch/lib/search_engine.py
@@ -97,7 +97,8 @@
      InvenioWebSearchReferstoLimitError, \
      InvenioWebSearchCitedbyLimitError, \
      CFG_WEBSEARCH_IDXPAIRS_FIELDS,\
-     CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH
+     CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH, \
+     CFG_WEBSEARCH_BLACKLISTED_FORMATS
 from invenio.search_engine_utils import (get_fieldvalues,
                                          get_fieldvalues_alephseq_like,
                                          record_exists)
@@ -1711,14 +1712,20 @@ def get_synonym_terms(term, kbr_name, match_type, use_memoise=False):
     return dterms.keys()
 
 
-def wash_output_format(ouput_format):
+def wash_output_format(ouput_format, verbose=False, req=None):
     """Wash output format FORMAT.  Currently only prevents input like
     'of=9' for backwards-compatible format that prints certain fields
     only.  (for this task, 'of=tm' is preferred)"""
     if str(ouput_format[0:3]).isdigit() and len(ouput_format) != 6:
         # asked to print MARC tags, but not enough digits,
         # so let's switch back to HTML brief default
         return 'hb'
+    elif ouput_format in CFG_WEBSEARCH_BLACKLISTED_FORMATS:
+        # Blacklisted formats fall back to HTML brief rather than to an
+        # empty result, which could be mistaken for "no records found".
+        if verbose:
+            write_warning("Selected format is not available through perform_request_search", req=req)
+        return 'hb'
     else:
         return ouput_format
 
@@ -5714,7 +5721,7 @@ def prs_wash_arguments(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=CFG_WE
     """
 
     # wash output format:
-    of = wash_output_format(of)
+    of = wash_output_format(of, verbose=verbose, req=req)
 
     # wash all arguments requiring special care
     p = wash_pattern(p)

diff --git a/modules/websearch/lib/search_engine_config.py b/modules/websearch/lib/search_engine_config.py
@@ -50,6 +50,11 @@
 # interfaces (0=simple, 1=advanced, 2=add-to-search):
 CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES = [0,1,2]
 
+# CFG_WEBSEARCH_BLACKLISTED_FORMATS -- list of formats that will be refused
+# by perform_request_search:
+# * recstruct is an internal format thus should not be exposed
+CFG_WEBSEARCH_BLACKLISTED_FORMATS = ["recstruct", "wapaff", "wapdat"]
+
 
 class InvenioWebSearchUnknownCollectionError(Exception):
     """Exception for bad collection."""

diff --git a/modules/websearch/lib/websearch_external_collections_getter_unit_tests.py b/modules/websearch/lib/websearch_external_collections_getter_unit_tests.py
@@ -51,7 +51,7 @@ def test_async_download(self):
         ##   - test 1 bad IP: 1.2.3.4
         ## Return the list of errors.
         checks = [
-            {'url': 'http://invenio-software.org', 'content': 'About Invenio'},
+            {'url': 'http://invenio-software.org', 'content': 'Invenio'},
             {'url': 'http://rjfreijoiregjreoijgoirg.fr'},
             {'url': 'http://1.2.3.4/'}]
 

diff --git a/modules/websubmit/lib/functions/Stamp_Uploaded_Files.py b/modules/websubmit/lib/functions/Stamp_Uploaded_Files.py
@@ -302,8 +302,8 @@ def visit_for_stamping(visit_for_stamping_arguments, dirname, filenames):
         'file_stamper_options' members.
        @param dirname: (string) - the path to the directory in which the
         files are to be stamped.
-       @param filenames: (list) - the names of each file in dirname. An
-        attempt will be made to stamp each of these files.
+       @param filenames: (list) - the names of each file and subdirectory in
+        dirname. An attempt will be made to stamp each of the files.
        @Exceptions Raised:
          + InvenioWebSubmitFunctionWarning;
          + InvenioWebSubmitFunctionError;
@@ -345,6 +345,10 @@ def visit_for_stamping(visit_for_stamping_arguments, dirname, filenames):
         path_to_subject_file = "%s/%s" % (dirname, file_to_stamp)
         file_stamper_options['input-file'] = path_to_subject_file
 
+        if not os.path.isfile(path_to_subject_file):
+            # If it's not a file, we can't stamp it. Continue with next file
+            continue
+
         ## Just before attempting to stamp the file, log the dictionary of
         ## options (file_stamper_options) that will be passed to websubmit-
         ## file-stamper: