diff --git a/INSTALL b/INSTALL
index f1ad87d6ea..e584746094 100644
--- a/INSTALL
+++ b/INSTALL
@@ -83,6 +83,9 @@ Contents
natively in UTF-8 mode by setting "default-character-set=utf8"
in various parts of your "my.cnf" file, such as in the
"[mysql]" part and elsewhere; but this is not really required.
+ Note also that you may encounter problems when MySQL is run in
+ "strict mode"; you may want to configure your "my.cnf" in order
+ to avoid using strict mode (such as `STRICT_ALL_TABLES`).
c) Redis server (may be on a remote machine) for user session
diff --git a/modules/bibdocfile/lib/bibdocfile.py b/modules/bibdocfile/lib/bibdocfile.py
index 12598b36ac..0cada52da5 100644
--- a/modules/bibdocfile/lib/bibdocfile.py
+++ b/modules/bibdocfile/lib/bibdocfile.py
@@ -59,6 +59,7 @@
import cgi
import sys
import copy
+import tarfile
if sys.hexversion < 0x2060000:
from md5 import md5
@@ -102,6 +103,7 @@
encode_for_xml
from invenio.urlutils import create_url, make_user_agent_string
from invenio.textutils import nice_size
+from invenio.webuser import collect_user_info
from invenio.access_control_engine import acc_authorize_action
from invenio.access_control_admin import acc_is_user_in_role, acc_get_role_id
from invenio.access_control_firerole import compile_role_definition, acc_firerole_check_user
@@ -123,7 +125,7 @@
CFG_BIBCATALOG_SYSTEM
from invenio.bibcatalog import BIBCATALOG_SYSTEM
from invenio.bibdocfile_config import CFG_BIBDOCFILE_ICON_SUBFORMAT_RE, \
- CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT
+ CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT, CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS
from invenio.pluginutils import PluginContainer
import invenio.template
@@ -797,17 +799,22 @@ def get_xml_8564(self):
return out
- def get_total_size_latest_version(self):
+ def get_total_size_latest_version(self, user_info=None, subformat=None):
"""
Returns the total size used on disk by all the files belonging
to this record and corresponding to the latest version.
+ Both arguments are passed unchanged to the
+ get_total_size_latest_version() method of every bibdoc of this record.
+ @param user_info: the user_info dictionary, used to check restrictions
+ @type user_info: dict
+ @param subformat: if subformat is specified, it limits files
+ only to those from that specific subformat
+ @type subformat: string
@return: the total size.
@rtype: integer
"""
size = 0
for (bibdoc, _) in self.bibdocs.values():
- size += bibdoc.get_total_size_latest_version()
+ size += bibdoc.get_total_size_latest_version(user_info, subformat)
return size
def get_total_size(self):
@@ -1564,6 +1571,36 @@ def get_text(self, extract_text_if_necessary=True):
return " ".join(texts)
+    def stream_archive_of_latest_files(self, req, files_size=''):
+        """
+        Stream to the user a tar archive holding the latest files of one
+        given size.
+
+        A file is added to the archive only when its subformat (as returned
+        by BibDocFile.get_subformat()) equals the internal format configured
+        for the requested size, it is not restricted for the requesting
+        user, and it is not hidden.
+
+        @param req: Apache Request Object; the archive is written to it
+        @type req: Apache Request Object
+        @param files_size: user-friendly name of the size, i.e. the first
+            element of one of the CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS
+            entries (defined in bibdocfile_config). When the value matches
+            no entry -- which is the case for the default empty string with
+            the shipped configuration -- the method returns without
+            writing anything.
+        @type files_size: string
+        """
+        # Translate the user-friendly size name into the internal subformat;
+        # the first matching configuration entry wins.
+        for entry in CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS:
+            if entry[0] == files_size:
+                internal_format = entry[1]
+                break
+        else:
+            # Incorrect file size
+            return
+        tarname = str(self.id) + "_" + files_size + '.tar'
+
+        # The user information is needed to check the restrictions below
+        user_info = collect_user_info(req)
+        req.content_type = "application/x-tar"
+        req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % tarname
+        tar = tarfile.open(fileobj=req, mode='w|')
+        # Keep only files that the user can download (right subformat,
+        # not restricted, not hidden)
+        for bibdocfile in self.list_latest_files():
+            if bibdocfile.get_subformat() != internal_format:
+                continue
+            if bibdocfile.is_restricted(user_info)[0] != 0:
+                continue
+            if bibdocfile.hidden:
+                continue
+            tar.add(bibdocfile.get_path(), arcname=bibdocfile.get_full_name(), recursive=False)
+        tar.close()
class BibDoc(object):
"""
@@ -2801,12 +2838,28 @@ def _build_related_file_list(self):
cur_doc = BibDoc.create_instance(docid=docid, human_readable=self.human_readable)
self.related_files[doctype].append(cur_doc)
- def get_total_size_latest_version(self):
+ def get_total_size_latest_version(self, user_info=None, subformat=None):
"""Return the total size used on disk of all the files belonging
- to this bibdoc and corresponding to the latest version."""
+ to this bibdoc and corresponding to the latest version. Restricted
+ and hidden files are not counted, unless there is no user_info.
+ @param user_info: the user_info dictionary, used to check restrictions
+ @type user_info: dict
+ @param subformat: if subformat is specified, it limits files
+ only to those from that specific subformat
+ @type subformat: string
+ @return: the sum of get_size() over the latest files that were counted
+ """
ret = 0
+ all_files = False
+ # If we are calling this function without user_info, then we want to
+ # see all the files
+ # (any falsy user_info, e.g. None or an empty dict, is treated this way)
+ if not user_info:
+ all_files = True
for bibdocfile in self.list_latest_files():
- ret += bibdocfile.get_size()
+ # First check for restrictions
+ # (a file passes when is_restricted(user_info)[0] is 0 and it is not hidden)
+ if all_files or (bibdocfile.is_restricted(user_info)[0] == 0 and not bibdocfile.hidden):
+ # Then check if the format is correct
+ if subformat is None or bibdocfile.get_subformat() == subformat:
+ ret += bibdocfile.get_size()
return ret
def get_total_size(self):
diff --git a/modules/bibdocfile/lib/bibdocfile_config.py b/modules/bibdocfile/lib/bibdocfile_config.py
index 0caa8ddbd2..c07eb606ff 100644
--- a/modules/bibdocfile/lib/bibdocfile_config.py
+++ b/modules/bibdocfile/lib/bibdocfile_config.py
@@ -69,3 +69,12 @@
# CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT -- this is the default subformat used
# when creating new icons.
CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT = "icon"
+
+# CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS -- a list (not dictionary, because
+# we want to preserve the order) that connects the different format sizes
+# (like 'small', 'medium', etc.) with internal format sizes (like 'icon-180', 'icon-640', etc.)
+# Each entry is a (user-friendly name, internal subformat) pair; the
+# 'original' name is paired with the empty subformat ''.
+CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS = [
+ ('small', 'icon-180'),
+ ('medium', 'icon-640'),
+ ('large', 'icon-1440'),
+ ('original', '')]
diff --git a/modules/bibdocfile/lib/bibdocfile_webinterface.py b/modules/bibdocfile/lib/bibdocfile_webinterface.py
index 1547dd3ddf..198b72fb17 100644
--- a/modules/bibdocfile/lib/bibdocfile_webinterface.py
+++ b/modules/bibdocfile/lib/bibdocfile_webinterface.py
@@ -83,6 +83,12 @@ def _lookup(self, component, path):
def getfile(req, form):
args = wash_urlargd(form, bibdocfile_templates.files_default_urlargd)
ln = args['ln']
+ if filename[:9] == "allfiles-":
+ files_size = filename[9:]
+ # stream a tar package to the user
+ brd = BibRecDocs(self.recid)
+ brd.stream_archive_of_latest_files(req, files_size)
+ return
_ = gettext_set_language(ln)
diff --git a/modules/bibformat/lib/bibreformat.py b/modules/bibformat/lib/bibreformat.py
index 1571146adb..9c712dcbd3 100644
--- a/modules/bibformat/lib/bibreformat.py
+++ b/modules/bibformat/lib/bibreformat.py
@@ -353,7 +353,11 @@ def task_run_core():
if task_has_option("last"):
recids += outdated_caches(fmt, last_updated)
- if task_has_option('ignore_without'):
+ if task_has_option('ignore_without') or \
+ task_has_option('collection') or \
+ task_has_option('field') or \
+ task_has_option('pattern') or \
+ task_has_option('recids'):
without_fmt = intbitset()
else:
without_fmt = missing_caches(fmt)
diff --git a/modules/bibsched/lib/tasklets/bst_inspirehep_names_mapper.py b/modules/bibsched/lib/tasklets/bst_inspirehep_names_mapper.py
new file mode 100644
index 0000000000..2203b412eb
--- /dev/null
+++ b/modules/bibsched/lib/tasklets/bst_inspirehep_names_mapper.py
@@ -0,0 +1,130 @@
+# This file is part of Invenio.
+# Copyright (C) 2016 CERN.
+#
+# Invenio is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# Invenio is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Invenio; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+"""INSPIRE-HEP IDs to names mapping (Invenio Bibliographic Tasklet).
+
+Harvest records on INSPIRE-HEP including MARC fields 035__9 and 035__a. Map
+INSPIRE-IDs to INSPIRE-HEP-names, and write the dictionary to a JSON file.
+
+Output example: {"INSPIRE-12345": "john.1", ...}
+
+Usage:
+$bibtasklet -N inspirehep-names-mapper
+ -T bst_inspirehep_names_mapper [-a json_file
+ [default: invenio.config.CFG_CACHEDIR/inspirehep-names-mapping.json]]
+"""
+
+import time
+import urllib2
+import xml.etree.ElementTree as ET
+from os.path import join
+from sys import stderr
+
+from invenio.bibauthority_people_utils import (
+ export_json, UtilsError, version_file)
+from invenio.bibtask import write_message
+from invenio.config import CFG_CACHEDIR
+
+INSPIREHEP_NAMES_MAPPING_FILE = join(
+ CFG_CACHEDIR, "inspirehep-names-mapping.json")
+ns = {"x": "http://www.loc.gov/MARC21/slim"} # XML namespaces
+
+
+def get_records(record_limit=250):
+    """Get MARCXML record elements.
+
+    Pages through the INSPIRE-HEP HepNames search results, asking for
+    ``record_limit`` records per request, until a page comes back without
+    any record element.
+
+    :param int record_limit: records limit each request. Maximum 251,
+        except if you are a superadmin
+    :return: list of MARCXML record elements or empty list
+    :raises urllib2.URLError: if a request to INSPIRE-HEP fails
+    """
+    url = (
+        "https://inspirehep.net/search?cc=HepNames"
+        "&p=035__9%3ABAI+035__%3AINSPIRE&of=xm&ot=035&rg={0}&jrec={1}")
+
+    # Value sent as the ``jrec`` URL argument; starts at 1 and advances by
+    # ``record_limit`` after every non-empty page.
+    counter = 1
+    records_all = []
+
+    while True:
+        # Errors from urlopen() propagate unchanged to the caller
+        response = urllib2.urlopen(
+            urllib2.Request(url.format(record_limit, counter)))
+        try:
+            page_result = response.read()
+        finally:
+            # Release the connection even if reading the body fails
+            response.close()
+
+        records = ET.fromstring(page_result).findall(
+            ".//x:record", namespaces=ns)
+        if not records:
+            break
+
+        # extend() avoids re-copying the accumulated list on every page
+        records_all.extend(records)
+        counter += record_limit
+
+        # Sleep some seconds between every request not to be banned
+        time.sleep(3)
+
+    return records_all
+
+
+def get_mapping(inspire_records):
+    """Get mapping INSPIRE-ID to INSPIRE-HEP-name.
+
+    For every record the 035 datafields are scanned. The ``a`` subfield of
+    a field whose ``9`` subfield reads "INSPIRE" is taken as the ID, and the
+    ``a`` subfield of a field whose ``9`` subfield reads "BAI" is taken as
+    the name. When a record carries several such fields, the last one wins.
+    Records for which either value is missing are skipped, so the result
+    never contains a ``None`` key or a ``None`` value.
+
+    :param list inspire_records: list of MARCXML record elements
+    :return: dictionary containing the mapping
+    """
+    inspire_mapping = {}
+
+    for record in inspire_records:
+        inspire_id = inspire_name = None
+
+        for datafield in record.findall(
+                "x:datafield[@tag='035']", namespaces=ns):
+            source = datafield.find("x:subfield[@code='9']", namespaces=ns)
+            value = datafield.find("x:subfield[@code='a']", namespaces=ns)
+            if source is None or value is None:
+                # Both subfields are needed to interpret the datafield
+                continue
+            if source.text == "INSPIRE":
+                inspire_id = value.text
+            elif source.text == "BAI":
+                inspire_name = value.text
+
+        # Only store complete pairs: an incomplete record would otherwise
+        # end up under the key None (or with the value None).
+        if inspire_id is not None and inspire_name is not None:
+            inspire_mapping[inspire_id] = inspire_name
+
+    return inspire_mapping
+
+
+def bst_inspirehep_names_mapper(json_file=INSPIREHEP_NAMES_MAPPING_FILE):
+    """Map INSPIRE-IDs to INSPIRE-HEP-names and write to JSON.
+
+    Fetches the records, builds the mapping and, when the mapping is not
+    empty, calls ``version_file(json_file, 1)`` and then exports the
+    mapping to ``json_file``. A ``urllib2.URLError`` raised along the way
+    and a ``UtilsError`` raised by the export are reported on stderr
+    through ``write_message`` instead of being propagated.
+
+    :param filepath json_file: path to JSON file containing the INSPIRE mapping
+    """
+    try:
+        fetched = get_records()
+        write_message("{0} records (HepNames) fetched.".format(len(fetched)))
+        names_by_id = get_mapping(fetched)
+        if not names_by_id:
+            # Nothing to export: leave the existing file untouched
+            return
+        # NOTE(review): presumably keeps a backup of the previous file
+        # before it is overwritten -- confirm against version_file()
+        version_file(json_file, 1)
+        try:
+            export_json(names_by_id, json_file)
+            write_message(
+                "Mapping for INSPIRE-HEP-ids and -names exported to JSON. "
+                "See '{0}'.".format(json_file))
+        except UtilsError as export_error:
+            write_message(export_error.reason, stderr)
+    except urllib2.URLError as fetch_error:
+        write_message(fetch_error.reason, stderr)
diff --git a/modules/webaccess/doc/hacking/webaccess-api.webdoc b/modules/webaccess/doc/hacking/webaccess-api.webdoc
index f657c7ad42..dbec7eed81 100644
--- a/modules/webaccess/doc/hacking/webaccess-api.webdoc
+++ b/modules/webaccess/doc/hacking/webaccess-api.webdoc
@@ -23,7 +23,7 @@
Invenio Access Control Engine can be called from within your Python programs
via both a regular Python API and CLI.
-In addition the you get an explanation of the program flow.
+In addition to the above features, you also get an explanation of the program flow.
Contents:
1. Regular API
diff --git a/modules/websearch/lib/search_engine.py b/modules/websearch/lib/search_engine.py
index ab15175be8..eb78a0cc09 100644
--- a/modules/websearch/lib/search_engine.py
+++ b/modules/websearch/lib/search_engine.py
@@ -97,7 +97,8 @@
InvenioWebSearchReferstoLimitError, \
InvenioWebSearchCitedbyLimitError, \
CFG_WEBSEARCH_IDXPAIRS_FIELDS,\
- CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH
+ CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH, \
+ CFG_WEBSEARCH_BLACKLISTED_FORMATS
from invenio.search_engine_utils import (get_fieldvalues,
get_fieldvalues_alephseq_like,
record_exists)
@@ -1711,7 +1712,7 @@ def get_synonym_terms(term, kbr_name, match_type, use_memoise=False):
return dterms.keys()
-def wash_output_format(ouput_format):
+def wash_output_format(ouput_format, verbose=False, req=None):
"""Wash output format FORMAT. Currently only prevents input like
'of=9' for backwards-compatible format that prints certain fields
only. (for this task, 'of=tm' is preferred)"""
@@ -1719,6 +1720,12 @@ def wash_output_format(ouput_format):
# asked to print MARC tags, but not enough digits,
# so let's switch back to HTML brief default
return 'hb'
+    elif ouput_format in CFG_WEBSEARCH_BLACKLISTED_FORMATS:
+        # Test the requested output format itself; the bare name ``format``
+        # would refer to the Python builtin and never match the blacklist.
+        if verbose:
+            write_warning("Selected format is not available through perform_request_search", req=req)
+        # Returning an empty list seems dangerous because you wouldn't know
+        # right away that the list is not supposed to be empty.
+        return 'hb'
else:
return ouput_format
@@ -5714,7 +5721,7 @@ def prs_wash_arguments(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=CFG_WE
"""
# wash output format:
- of = wash_output_format(of)
+ of = wash_output_format(of, verbose=verbose, req=req)
# wash all arguments requiring special care
p = wash_pattern(p)
diff --git a/modules/websearch/lib/search_engine_config.py b/modules/websearch/lib/search_engine_config.py
index 17d8daf1da..2bc848a427 100644
--- a/modules/websearch/lib/search_engine_config.py
+++ b/modules/websearch/lib/search_engine_config.py
@@ -50,6 +50,11 @@
# interfaces (0=simple, 1=advanced, 2=add-to-search):
CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES = [0,1,2]
+# CFG_WEBSEARCH_BLACKLISTED_FORMATS -- list of formats that will be refused
+# by perform_request_search:
+# * recstruct is an internal format thus should not be exposed
+CFG_WEBSEARCH_BLACKLISTED_FORMATS = ["recstruct", "wapaff", "wapdat"]
+
class InvenioWebSearchUnknownCollectionError(Exception):
"""Exception for bad collection."""
diff --git a/modules/websearch/lib/websearch_external_collections_getter_unit_tests.py b/modules/websearch/lib/websearch_external_collections_getter_unit_tests.py
index 8d55a05730..5ac036d9b7 100644
--- a/modules/websearch/lib/websearch_external_collections_getter_unit_tests.py
+++ b/modules/websearch/lib/websearch_external_collections_getter_unit_tests.py
@@ -51,7 +51,7 @@ def test_async_download(self):
## - test 1 bad IP: 1.2.3.4
## Return the list of errors.
checks = [
- {'url': 'http://invenio-software.org', 'content': 'About Invenio'},
+ {'url': 'http://invenio-software.org', 'content': 'Invenio'},
{'url': 'http://rjfreijoiregjreoijgoirg.fr'},
{'url': 'http://1.2.3.4/'}]
diff --git a/modules/websubmit/lib/functions/Stamp_Uploaded_Files.py b/modules/websubmit/lib/functions/Stamp_Uploaded_Files.py
index e8b5b42147..40c9cee7e6 100644
--- a/modules/websubmit/lib/functions/Stamp_Uploaded_Files.py
+++ b/modules/websubmit/lib/functions/Stamp_Uploaded_Files.py
@@ -302,8 +302,8 @@ def visit_for_stamping(visit_for_stamping_arguments, dirname, filenames):
'file_stamper_options' members.
@param dirname: (string) - the path to the directory in which the
files are to be stamped.
- @param filenames: (list) - the names of each file in dirname. An
- attempt will be made to stamp each of these files.
+ @param filenames: (list) - the names of each file and subdirectory in
+ dirname. An attempt will be made to stamp each of the files.
@Exceptions Raised:
+ InvenioWebSubmitFunctionWarning;
+ InvenioWebSubmitFunctionError;
@@ -345,6 +345,10 @@ def visit_for_stamping(visit_for_stamping_arguments, dirname, filenames):
path_to_subject_file = "%s/%s" % (dirname, file_to_stamp)
file_stamper_options['input-file'] = path_to_subject_file
+ if not os.path.isfile(path_to_subject_file):
+ # If it's not a file, we can't stamp it. Continue with next file
+ continue
+
## Just before attempting to stamp the file, log the dictionary of
## options (file_stamper_options) that will be passed to websubmit-
## file-stamper: