Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions INSTALL
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ Contents
natively in UTF-8 mode by setting "default-character-set=utf8"
in various parts of your "my.cnf" file, such as in the
"[mysql]" part and elsewhere; but this is not really required.
Note also that you may encounter problems when MySQL is run in
"strict mode"; you may want to configure your "my.cnf" so that
strict SQL modes (such as `STRICT_ALL_TABLES`) are not enabled.
<http://mysql.com/>

c) Redis server (may be on a remote machine) for user session
Expand Down
65 changes: 59 additions & 6 deletions modules/bibdocfile/lib/bibdocfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
import cgi
import sys
import copy
import tarfile

if sys.hexversion < 0x2060000:
from md5 import md5
Expand Down Expand Up @@ -102,6 +103,7 @@
encode_for_xml
from invenio.urlutils import create_url, make_user_agent_string
from invenio.textutils import nice_size
from invenio.webuser import collect_user_info
from invenio.access_control_engine import acc_authorize_action
from invenio.access_control_admin import acc_is_user_in_role, acc_get_role_id
from invenio.access_control_firerole import compile_role_definition, acc_firerole_check_user
Expand All @@ -123,7 +125,7 @@
CFG_BIBCATALOG_SYSTEM
from invenio.bibcatalog import BIBCATALOG_SYSTEM
from invenio.bibdocfile_config import CFG_BIBDOCFILE_ICON_SUBFORMAT_RE, \
CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT
CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT, CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS
from invenio.pluginutils import PluginContainer

import invenio.template
Expand Down Expand Up @@ -797,17 +799,22 @@ def get_xml_8564(self):

return out

def get_total_size_latest_version(self):
def get_total_size_latest_version(self, user_info=None, subformat=None):
"""
Returns the total size used on disk by all the files belonging
to this record and corresponding to the latest version.

@param user_info: the user_info dictionary, used to check restrictions
@type: dict
@param subformat: if subformat is specified, it limits files
only to those from that specific subformat
@type subformat: string
@return: the total size.
@rtype: integer
"""
size = 0
for (bibdoc, _) in self.bibdocs.values():
size += bibdoc.get_total_size_latest_version()
size += bibdoc.get_total_size_latest_version(user_info, subformat)
return size

def get_total_size(self):
Expand Down Expand Up @@ -1564,6 +1571,36 @@ def get_text(self, extract_text_if_necessary=True):

return " ".join(texts)

def stream_archive_of_latest_files(self, req, files_size=''):
    """
    Streams a tar archive with the latest version of all files of a
    certain size (that are not restricted or hidden) to the user.

    The user-friendly size name is translated to an internal subformat
    through CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS; the internal subformat
    is then compared with the output of BibDocFile.get_subformat().

    @param req: Apache Request Object
    @type req: Apache Request Object
    @param files_size: user-friendly name of the wanted size; it must be
        equal to the first element of one of the pairs listed in
        CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS. Any other value (including
        the default empty string, unless the configuration declares such
        a name) makes the function return without sending anything.
    @type files_size: string
    """
    # Get the internal subformat from the user-friendly file size name.
    matching_formats = [internal for (name, internal)
                        in CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS
                        if name == files_size]
    if not matching_formats:
        # Incorrect file size: nothing to stream.
        return
    internal_format = matching_formats[0]
    # files_size has been validated against the configuration above, so it
    # is safe to use it inside the Content-Disposition header.
    tarname = str(self.id) + "_" + files_size + '.tar'

    # Select files that user can download (not hidden nor restricted)
    user_info = collect_user_info(req)
    req.content_type = "application/x-tar"
    req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % tarname
    # 'w|' writes an uncompressed tar as a non-seekable stream, straight
    # into the request object.
    tar = tarfile.open(fileobj=req, mode='w|')
    try:
        for f in self.list_latest_files():
            if f.get_subformat() == internal_format and f.is_restricted(user_info)[0] == 0 and not f.hidden:
                tar.add(f.get_path(), arcname=f.get_full_name(), recursive=False)
    finally:
        # Always release the archive object, even if adding a file fails
        # (try/finally rather than "with" to stay compatible with old
        # Python 2 versions whose TarFile is not a context manager).
        tar.close()

class BibDoc(object):
"""
Expand Down Expand Up @@ -2801,12 +2838,28 @@ def _build_related_file_list(self):
cur_doc = BibDoc.create_instance(docid=docid, human_readable=self.human_readable)
self.related_files[doctype].append(cur_doc)

def get_total_size_latest_version(self):
def get_total_size_latest_version(self, user_info=None, subformat=None):
"""Return the total size used on disk of all the files belonging
to this bibdoc and corresponding to the latest version."""
to this bibdoc and corresponding to the latest version. Restricted
and hidden files are not counted, unless there is no user_info.
@param user_info: the user_info dictionary, used to check restrictions
@type: dict
@param subformat: if subformat is specified, it limits files
only to those from that specific subformat
@type subformat: string
"""
ret = 0
all_files = False
# If we are calling this function without user_info, then we want to
# see all the files
if not user_info:
all_files = True
for bibdocfile in self.list_latest_files():
ret += bibdocfile.get_size()
# First check for restrictions
if all_files or (bibdocfile.is_restricted(user_info)[0] == 0 and not bibdocfile.hidden):
# Then check if the format is correct
if subformat is None or bibdocfile.get_subformat() == subformat:
ret += bibdocfile.get_size()
return ret

def get_total_size(self):
Expand Down
9 changes: 9 additions & 0 deletions modules/bibdocfile/lib/bibdocfile_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,12 @@
# CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT -- this is the default subformat used
# when creating new icons.
CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT = "icon"

# CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS -- a list (not dictionary, because
# we want to preserve the order) that connects the different format sizes
# (like 'small', 'medium', etc.) with internal format sizes (like 'icon-180', 'icon-640', etc.)
CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS = [
('small', 'icon-180'),
('medium', 'icon-640'),
('large', 'icon-1440'),
('original', '')]
6 changes: 6 additions & 0 deletions modules/bibdocfile/lib/bibdocfile_webinterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ def _lookup(self, component, path):
def getfile(req, form):
args = wash_urlargd(form, bibdocfile_templates.files_default_urlargd)
ln = args['ln']
if filename[:9] == "allfiles-":
files_size = filename[9:]
# stream a tar package to the user
brd = BibRecDocs(self.recid)
brd.stream_archive_of_latest_files(req, files_size)
return

_ = gettext_set_language(ln)

Expand Down
6 changes: 5 additions & 1 deletion modules/bibformat/lib/bibreformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,11 @@ def task_run_core():
if task_has_option("last"):
recids += outdated_caches(fmt, last_updated)

if task_has_option('ignore_without'):
if task_has_option('ignore_without') or \
task_has_option('collection') or \
task_has_option('field') or \
task_has_option('pattern') or \
task_has_option('recids'):
without_fmt = intbitset()
else:
without_fmt = missing_caches(fmt)
Expand Down
130 changes: 130 additions & 0 deletions modules/bibsched/lib/tasklets/bst_inspirehep_names_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# This file is part of Invenio.
# Copyright (C) 2016 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

"""INSPIRE-HEP IDs to names mapping (Invenio Bibliographic Tasklet).

Harvest records on INSPIRE-HEP including MARC fields 035__9 and 035__a. Map
INSPIRE-IDs to INSPIRE-HEP-names, and write the dictionary to a JSON file.

Output example: {"INSPIRE-12345": "john.1", ...}

Usage:
$bibtasklet -N inspirehep-names-mapper
-T bst_inspirehep_names_mapper [-a json_file
[default: invenio.config.CFG_CACHEDIR/inspirehep-names-mapping.json]]
"""

import time
import urllib2
import xml.etree.ElementTree as ET
from os.path import join
from sys import stderr

from invenio.bibauthority_people_utils import (
export_json, UtilsError, version_file)
from invenio.bibtask import write_message
from invenio.config import CFG_CACHEDIR

# Default path of the exported JSON mapping, located in CFG_CACHEDIR.
INSPIREHEP_NAMES_MAPPING_FILE = join(
CFG_CACHEDIR, "inspirehep-names-mapping.json")
# Prefix -> namespace URI map passed as `namespaces=` to the ElementTree
# find()/findall() calls in this module ("x" is the MARC21 slim namespace).
ns = {"x": "http://www.loc.gov/MARC21/slim"} # XML namespaces


def get_records(record_limit=250):
    """Get MARCXML record elements.

    Requests the HepNames search results page by page (``record_limit``
    records per request) until a page without any record is returned.

    :param int record_limit: records limit each request. Maximum 251,
        except if you are a superadmin
    :return: list of MARCXML record elements or empty list
    :raises urllib2.URLError: if one of the requests fails
    """
    counter = 1  # value of the `jrec` URL argument for the next request
    records_all = []

    url = (
        "https://inspirehep.net/search?cc=HepNames"
        "&p=035__9%3ABAI+035__%3AINSPIRE&of=xm&ot=035&rg={0}&jrec={1}")

    while True:
        req = urllib2.Request(url.format(record_limit, counter))
        # URLError is deliberately not caught here: the caller handles it.
        response = urllib2.urlopen(req)
        try:
            page_result = response.read()
        finally:
            # Do not leak one open connection per fetched page.
            response.close()

        root = ET.fromstring(page_result)
        records = root.findall(".//x:record", namespaces=ns)
        if not records:
            break

        records_all.extend(records)
        counter += record_limit

        # Sleep some seconds between every request not to be banned
        time.sleep(3)

    return records_all


def get_mapping(inspire_records):
    """Get mapping INSPIRE-ID to INSPIRE-HEP-name.

    For every record, the 035 datafields are scanned: the ``a`` subfield of
    the datafield whose ``9`` subfield is "INSPIRE" is taken as the ID, the
    one whose ``9`` subfield is "BAI" as the name. Records for which either
    value is missing are skipped, so the result never contains ``None`` as
    key or value.

    :param list inspire_records: list of MARCXML record elements
    :return: dictionary containing the mapping
    """
    inspire_mapping = {}

    for record in inspire_records:
        inspire_id = inspire_name = None

        datafields = record.findall("x:datafield[@tag='035']", namespaces=ns)
        for datafield in datafields:
            source = datafield.find("x:subfield[@code='9']", namespaces=ns)
            value = datafield.find("x:subfield[@code='a']", namespaces=ns)
            if source is None or value is None:
                # Incomplete datafield: nothing to learn from it.
                continue
            if source.text == "INSPIRE":
                inspire_id = value.text
            elif source.text == "BAI":
                inspire_name = value.text

        if inspire_id is None or inspire_name is None:
            # Without both halves there is no ID -> name pair to export;
            # storing it would create a None key/value in the JSON output.
            continue
        inspire_mapping[inspire_id] = inspire_name

    return inspire_mapping


def bst_inspirehep_names_mapper(json_file=INSPIREHEP_NAMES_MAPPING_FILE):
    """Map INSPIRE-IDs to INSPIRE-HEP-names and write to JSON.

    Fetches the HepNames records, builds the ID -> name mapping and, when
    the mapping is not empty, versions the existing file and exports the
    new mapping. Fetch and export errors are reported on stderr through
    write_message instead of being raised.

    :param filepath json_file: path to JSON file containing the INSPIRE mapping
    """
    try:
        fetched = get_records()
        write_message("{0} records (HepNames) fetched.".format(len(fetched)))
        id_to_name = get_mapping(fetched)
        if not id_to_name:
            # Nothing to export: leave the current file untouched.
            return
        version_file(json_file, 1)
        try:
            export_json(id_to_name, json_file)
            write_message(
                "Mapping for INSPIRE-HEP-ids and -names exported to JSON. "
                "See '{0}'.".format(json_file))
        except UtilsError as export_error:
            write_message(export_error.reason, stderr)
    except urllib2.URLError as fetch_error:
        write_message(fetch_error.reason, stderr)
2 changes: 1 addition & 1 deletion modules/webaccess/doc/hacking/webaccess-api.webdoc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
<pre>
Invenio Access Control Engine can be called from within your Python programs
via both a regular Python API and CLI.
In addition the you get an explanation of the program flow.
In addition to the above features, you also get an explanation of the program flow.

Contents:
1. Regular API
Expand Down
13 changes: 10 additions & 3 deletions modules/websearch/lib/search_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@
InvenioWebSearchReferstoLimitError, \
InvenioWebSearchCitedbyLimitError, \
CFG_WEBSEARCH_IDXPAIRS_FIELDS,\
CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH
CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH, \
CFG_WEBSEARCH_BLACKLISTED_FORMATS
from invenio.search_engine_utils import (get_fieldvalues,
get_fieldvalues_alephseq_like,
record_exists)
Expand Down Expand Up @@ -1711,14 +1712,20 @@ def get_synonym_terms(term, kbr_name, match_type, use_memoise=False):
return dterms.keys()


def wash_output_format(ouput_format):
def wash_output_format(ouput_format, verbose=False, req=None):
    """Wash output format FORMAT.

    Prevents input like 'of=9' for the backwards-compatible format that
    prints certain fields only (for this task, 'of=tm' is preferred), and
    refuses the formats listed in CFG_WEBSEARCH_BLACKLISTED_FORMATS.

    @param ouput_format: the output format requested by the caller
    @type ouput_format: string
    @param verbose: if true, a warning is written when a blacklisted
        format is refused
    @param req: request object passed on to write_warning
    @return: the washed output format; 'hb' (HTML brief) replaces any
        refused value
    @rtype: string
    """
    if str(ouput_format[0:3]).isdigit() and len(ouput_format) != 6:
        # asked to print MARC tags, but not enough digits,
        # so let's switch back to HTML brief default
        return 'hb'
    elif ouput_format in CFG_WEBSEARCH_BLACKLISTED_FORMATS:
        # NB: the argument must be tested here, not the builtin `format`,
        # otherwise the blacklist is never applied.
        if verbose:
            write_warning("Selected format is not available through perform_request_search", req=req)
        # Fall back to the HTML brief default instead of refusing the
        # request outright.
        return 'hb'
    else:
        return ouput_format

Expand Down Expand Up @@ -5714,7 +5721,7 @@ def prs_wash_arguments(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=CFG_WE
"""

# wash output format:
of = wash_output_format(of)
of = wash_output_format(of, verbose=verbose, req=req)

# wash all arguments requiring special care
p = wash_pattern(p)
Expand Down
5 changes: 5 additions & 0 deletions modules/websearch/lib/search_engine_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@
# interfaces (0=simple, 1=advanced, 2=add-to-search):
CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES = [0,1,2]

# CFG_WEBSEARCH_BLACKLISTED_FORMATS -- list of formats that will be refused
# by perform_request_search:
# * recstruct is an internal format thus should not be exposed
CFG_WEBSEARCH_BLACKLISTED_FORMATS = ["recstruct", "wapaff", "wapdat"]


class InvenioWebSearchUnknownCollectionError(Exception):
"""Exception for bad collection."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_async_download(self):
## - test 1 bad IP: 1.2.3.4
## Return the list of errors.
checks = [
{'url': 'http://invenio-software.org', 'content': 'About Invenio'},
{'url': 'http://invenio-software.org', 'content': 'Invenio'},
{'url': 'http://rjfreijoiregjreoijgoirg.fr'},
{'url': 'http://1.2.3.4/'}]

Expand Down
8 changes: 6 additions & 2 deletions modules/websubmit/lib/functions/Stamp_Uploaded_Files.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,8 @@ def visit_for_stamping(visit_for_stamping_arguments, dirname, filenames):
'file_stamper_options' members.
@param dirname: (string) - the path to the directory in which the
files are to be stamped.
@param filenames: (list) - the names of each file in dirname. An
attempt will be made to stamp each of these files.
@param filenames: (list) - the names of each file and subdirectory in
dirname. An attempt will be made to stamp each of the files.
@Exceptions Raised:
+ InvenioWebSubmitFunctionWarning;
+ InvenioWebSubmitFunctionError;
Expand Down Expand Up @@ -345,6 +345,10 @@ def visit_for_stamping(visit_for_stamping_arguments, dirname, filenames):
path_to_subject_file = "%s/%s" % (dirname, file_to_stamp)
file_stamper_options['input-file'] = path_to_subject_file

if not os.path.isfile(path_to_subject_file):
# If it's not a file, we can't stamp it. Continue with next file
continue

## Just before attempting to stamp the file, log the dictionary of
## options (file_stamper_options) that will be passed to websubmit-
## file-stamper:
Expand Down