diff --git a/modules/bibauthority/bin/Makefile.am b/modules/bibauthority/bin/Makefile.am index d9584c60ef..066ee903ec 100644 --- a/modules/bibauthority/bin/Makefile.am +++ b/modules/bibauthority/bin/Makefile.am @@ -1,6 +1,5 @@ -# # This file is part of Invenio. -# Copyright (C) 2011 CERN. +# Copyright (C) 2011, 2012, 2013, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as @@ -16,4 +15,8 @@ # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +bin_SCRIPTS = bibauthority_people + +EXTRA_DIST = bibauthority_people.in + CLEANFILES = *~ *.tmp diff --git a/modules/bibauthority/bin/bibauthority_people b/modules/bibauthority/bin/bibauthority_people new file mode 100644 index 0000000000..422dbf04a6 --- /dev/null +++ b/modules/bibauthority/bin/bibauthority_people @@ -0,0 +1,166 @@ +#!/usr/bin/env python + +# This file is part of Invenio. +# Copyright (C) 2015 CERN. +# +# Invenio is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# Invenio is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Invenio; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +"""BibAuthority CERN people collection command line interface. + +Map all CERN LDAP records to MARC 21 authority records, write to XML files and +upload to CDS. + +See bibsched tasklet bst_bibauthority_people_updater for updating the collection. 
def load_json(parser, json_file):
    """Return the data stored in a JSON file (argparse ``type`` callback).

    :param argparse.ArgumentParser parser: parser used to report failures
    :param filepath json_file: path to the JSON file
    :return: python object loaded from json_file; on failure
        ``parser.error`` prints the usage message and exits
    """
    try:
        return get_data_from_json(json_file)
    except UtilsError as e:
        # argparse interpolates the message itself; give it a plain string
        parser.error(str(e))


def get_records(ldap_searchfilter=CFG_BIBAUTHORITY_LDAP_SEARCHFILTER,
                ldap_attrlist=CFG_BIBAUTHORITY_LDAP_ATTRLIST):
    """Return user records from LDAP.

    :param string ldap_searchfilter: LDAP search filter
    :param list ldap_attrlist: LDAP attributes to fetch for every record
    :return: list of records, empty list if the LDAP query failed (the
        error is reported on stderr)
    """
    try:
        return get_users_records_data(
            ldap_searchfilter, ldap_attrlist, "utf-8")
    except LDAPError as e:
        sys.stderr.write("{0}\n".format(e))
        return []


# The mapping option is '-m' (see below), not '-l'
usage = ("bibauthority_people.py [-h] [[-r RECORDSIZE] [-x FILE [-m FILE] "
         "[-j FILE]]] [-i FILE [FILE ...]] [-c]")

parser = argparse.ArgumentParser(
    description="Command line interface for the CERN people collection. Map "
                "all CERN LDAP records to MARC 21 authority records, write "
                "to XML files and upload to CDS.",
    usage=usage)

group1 = parser.add_argument_group("Export")
group2 = parser.add_argument_group("Insertion to CDS")
group3 = parser.add_argument_group("Information")

group1.add_argument(
    "-r",
    "--recordsize",
    dest="recordsize",
    type=int,
    default=500,
    help="limit number of record elements for each XML file and has to be "
         "used together with '-x' [default: %(default)d]. For unlimited "
         "records use 0")
group1.add_argument(
    "-x",
    "--exportxml",
    dest="exportxml",
    type=str,
    metavar="FILE",
    help="export mapped CERN LDAP records to XML FILE(s). Number of records "
         "each FILE is based on RECORDSIZE")
group1.add_argument(
    "-m",
    "--mapping",
    dest="inspireids",
    type=lambda f: load_json(parser, f),
    metavar="FILE",
    help="mapping dictionary {'CERN-ID': 'Inspire-ID', ...} stored in FILE "
         "used for mapping the CERN-ID to Inspire-ID instead of using "
         "ATLAS GLANCE. Works together with '-x'")
group1.add_argument(
    "-j",
    "--exportjson",
    dest="exportjson",
    type=str,
    metavar="FILE",
    help="export CERN LDAP records to a JSON-formatted FILE, recommended "
         "using it together with '-x'")
group2.add_argument(
    "-i",
    "--insert",
    dest="insert",
    type=str,
    nargs="+",
    metavar="FILE",
    help="insert XML FILE(s) to CDS using invenio.bibupload -i. "
         "'-i DIRECTORY' will upload all XML files in the given DIRECTORY")
group3.add_argument(
    "-c",
    "--count",
    dest="count",
    action="store_true",
    help="count all primary CERN LDAP records")

args = parser.parse_args()

# Both export modes work on the same LDAP snapshot, so fetch it only once
if args.exportxml or args.exportjson:
    records = get_records()
    print("{0} records fetched from CERN LDAP".format(len(records)))

if args.exportxml:
    try:
        mapper = Mapper(mapping_inspire_ids=args.inspireids or None)
        mapper.map_ldap_records(records)
        mapper.write_marcxml(args.exportxml, args.recordsize)
    except MapperError as e:
        # file.write() only accepts strings, not exception objects
        sys.stderr.write("{0}\n".format(e))
        sys.exit(1)

if args.exportjson:
    try:
        export_json(records, args.exportjson)
    except UtilsError as e:
        sys.stderr.write("{0}\n".format(e))
        sys.exit(1)

if args.insert:
    first_arg = args.insert[0]
    # '-i DIRECTORY' uploads every XML file located in DIRECTORY
    if isdir(first_arg):
        xml_files = sorted(glob(join(first_arg, "*.xml")))
    else:
        xml_files = args.insert

    if not xml_files:
        print("Error: no XML files to upload")
    # Report the outcome of every submission, not only of the last one
    for xml_file in xml_files:
        task_id = bibupload(xml_file, "-i", "bibauthority-people-insert")
        if task_id:
            print("Task (identifier: {0}) is correctly enqueued".format(
                task_id))
        else:
            print("Error: failed to enqueue task")

if args.count:
    # Only one attribute is needed to count the records
    records = get_records(ldap_attrlist=['employeeID'])
    print("{0} records found on CERN LDAP".format(len(records)))
def _atlas_glance_to_inspire_id():
    """Build the CERN-ID to Inspire-ID dictionary from the author source.

    Authors returned by ``query_author_source("")`` that lack either a
    'cernccid' or an 'inspireid' value are left out.

    :return: dictionary {cernccid: inspireid, ...}
    """
    mapping = {}
    for author in query_author_source(""):
        if not author.get("cernccid"):
            continue
        if not author.get("inspireid"):
            continue
        mapping[author["cernccid"]] = author["inspireid"]
    return mapping


# Filter applied to the LDAP search
CFG_BIBAUTHORITY_LDAP_SEARCHFILTER = "(&(objectClass=*)(employeeType=Primary))"

# Attributes requested for every LDAP record
# (bibauthority_people_mapper contains the same attributes)
CFG_BIBAUTHORITY_LDAP_ATTRLIST = [
    "employeeID",
    "givenName",
    "sn",
    "displayName",
    "facsimileTelephoneNumber",
    "telephoneNumber",
    "mobile",
    "mail",
    "department",
    "cernGroup",
    "description",
    "division",
    "cernInstituteName",
    "extensionAttribute11",
]

# File holding the CERN LDAP records
CFG_BIBAUTHORITY_RECORDS_JSON_FILE = join(CFG_CACHEDIR, "records.json")

# File holding the updated MARC 21 authority records
CFG_BIBAUTHORITY_RECORDS_UPDATES_FILE = join(
    CFG_TMPSHAREDDIR, "records_updates.xml")

# Prefix
CFG_BIBAUTHORITY_AUTHOR_CDS = "AUTHOR|(CDS)"

# Prefix used in MARC field 035__a
CFG_BIBAUTHORITY_AUTHOR_CERN = "AUTHOR|(SzGeCERN)"

# Prefix used in MARC field 035__a
CFG_BIBAUTHORITY_AUTHOR_INSPIRE = "AUTHOR|(INSPIRE)"

# Lazily built dictionary mapping CERN-ID to Inspire-ID
CFG_BIBAUTHORITY_ATLAS_GLANCE_CERN_ID_TO_INSPIRE_ID_MAPPING = LazyDict(
    _atlas_glance_to_inspire_id)

# Sender of the email sent when duplicate Inspire-IDs are found
CFG_BIBAUTHORITY_ATLAS_GLANCE_EMAIL_FROM = "noreply@cern.ch"

# Recipient of the email sent when duplicate Inspire-IDs are found
CFG_BIBAUTHORITY_ATLAS_GLANCE_EMAIL_TO = "cds-monitoring@cern.ch"
class MapperError(Exception):

    """Base class for exceptions in this module."""

    pass


class Mapper(object):

    """Map CERN LDAP records to MARC 21 authority records (MARCXML).

    MARC 21 authority reference: http://www.loc.gov/marc/authority/
    """

    def __init__(self, mapping_inspire_ids=None):
        """Initialize the mapper properties.

        :param dict mapping_inspire_ids: optional {CERN-ID: Inspire-ID}
            dictionary handed to get_inspire_id; when empty or None,
            get_inspire_id is called with its default mapping
        """
        self.roots = []  # Contain all root elements
        self.records = []  # Contain all (mapped) MARC records
        # Mapping rules: LDAP attributes to MARC 21 authority
        # LDAP attributes used in
        # bibauthority_people_config.CFG_BIBAUTHORITY_LDAP_ATTRLIST
        self.mapper_dict = {
            "employeeID": "035__a",
            "givenName": "1000_a",
            "sn": "1001_a",
            "displayName": "100__a",
            "facsimileTelephoneNumber": "371__f",
            "telephoneNumber": "371__k",
            "mobile": "371__l",
            "mail": "371__m",
            "department": "371__d",
            "cernGroup": "371__g",
            "description": "371__h",
            "division": "371__i",
            "cernInstituteName": "371__0",
            "extensionAttribute11": "371__1"
        }
        self.mapping_inspire_ids = mapping_inspire_ids

    def _split_marc_id(self, marc_id):
        """Split MARC 21 identifier which is defined in the mapper_dict.

        :param string marc_id: format: fffiis,
            f = Field Code (code);
            i = First Indicator (ind1), Second Indicator (ind2);
            s = Subfield Codes (subfield_code, optional)
        :return: code, ind1, ind2, and subfield_code (None if absent)
        :raises IndexError: if marc_id is shorter than five characters
        """
        subfield_code = marc_id[5] if len(marc_id) > 5 else None
        return marc_id[:3], marc_id[3], marc_id[4], subfield_code

    def _create_root(self):
        """Create root element 'collection' with the attribute 'xmlns'.

        :return: root element
        """
        return etree.Element(
            "collection", {"xmlns": "http://www.loc.gov/MARC21/slim"})

    def _create_record(self, parent=None):
        """Create record element.

        :param elem parent: parent of record element (optional)
        :return: record element
        """
        # Compare with None: an lxml element without children is falsy, so
        # a truth test would ignore an empty (but valid) parent
        if parent is not None:
            return etree.SubElement(parent, "record")
        return etree.Element("record")

    def _create_controlfield(self, parent, attr_code, inner_text=None):
        """Create child element 'controlfield' of parent.

        :param elem parent: parent element, usually 'collection'
        :param string attr_code: value for attribute 'tag'
        :param string inner_text: inner text of element
        :return: controlfield element, child of parent
        """
        controlfield = etree.SubElement(
            parent, "controlfield", {"tag": attr_code})
        if inner_text:
            controlfield.text = inner_text
        return controlfield

    def _create_datafield(self, parent, marc_id, value, repeatable=False):
        """Create child element 'datafield' (including 'subfield') of parent.

        :param elem parent: parent element, usually 'record'
        :param string marc_id: full MARC-id, example: 035__a
        :param string value: value stored in subfield
        :param bool repeatable: allows multiple datafields with same codes
        :return: either new or existing datafield element (depending on
            repeatable), child of parent
        """
        marc_code, marc_ind1, marc_ind2, marc_subfield_code = (
            self._split_marc_id(marc_id))

        # Modify indicators: '_' stands for a blank indicator
        if marc_ind1 == "_":
            marc_ind1 = " "
        elif marc_ind1.isalpha():
            marc_ind1 = marc_ind1.upper()
        if marc_ind2 == "_":
            marc_ind2 = " "
        elif marc_ind2.isalpha():
            marc_ind2 = marc_ind2.upper()

        # Quote the tag: unquoted, XPath would compare it as a number
        find = parent.xpath(
            "datafield[@tag='{0}' and @ind1='{1}' and @ind2='{2}']".format(
                marc_code, marc_ind1, marc_ind2))
        if not find or repeatable:
            # Built from pairs so that the attribute order really is
            # tag, ind1, ind2 (a dict literal would not guarantee it)
            elem_datafield = etree.SubElement(
                parent,
                "datafield",
                OrderedDict([
                    ("tag", marc_code),
                    ("ind1", marc_ind1),
                    ("ind2", marc_ind2)]))
        else:
            elem_datafield = find[0]

        # Create subfield with value and append to datafield
        subfield = etree.SubElement(
            elem_datafield, "subfield", {"code": marc_subfield_code})
        subfield.text = value

        return elem_datafield

    def _attach_records(self, record_size=500):
        """Attach record elements to root element(s).

        The list of root elements is rebuilt on every call, so calling
        this method (or write_marcxml) twice does not duplicate roots.

        :param int record_size: record child elements in a root node,
            if <= 0: append all records to one root node
        :return : list of root elements (empty if there are no records)
        """
        self.roots = []
        current_root = None

        for index, record in enumerate(self.records):
            # Open a new root for the first record and, when the size is
            # limited, every record_size records afterwards
            if current_root is None or (
                    record_size > 0 and index % record_size == 0):
                current_root = self._create_root()
                self.roots.append(current_root)
            current_root.append(record)

        return self.roots

    def map_ldap_record(self, record):
        """Map LDAP record to MARC 21 authority record (XML).

        The Inspire-ID is looked up in self.mapping_inspire_ids if set,
        otherwise in the default mapping of get_inspire_id.

        :param dictionary record: LDAP record (result-data)
        :return: record element
        """
        elem_record = self._create_record()

        # Map each LDAP attribute for one record to a datafield; iterate
        # sorted by MARC id so the generated XML is deterministic
        for attr_key, marc_id in sorted(
                self.mapper_dict.items(), key=lambda item: item[1]):
            value = record.get(attr_key)
            if not value:
                # Attribute not present in this LDAP record
                continue
            value = value[0]

            # Modify employeeID to valid system control number
            if attr_key == "employeeID":
                value = "{0}{1}".format(
                    CFG_BIBAUTHORITY_AUTHOR_CERN, value)

            self._create_datafield(elem_record, marc_id, value)

        # Additional datafields for collections (980__a is repeatable)
        self._create_datafield(elem_record, "371__v", "CERN LDAP")
        self._create_datafield(elem_record, "690C_a", "CERN")
        self._create_datafield(elem_record, "980__a", "PEOPLE", True)
        self._create_datafield(elem_record, "980__a", "AUTHORITY", True)

        # Add Inspire-ID (if exists for employeeID) to record
        employee_id = record.get('employeeID')
        if employee_id:
            if self.mapping_inspire_ids:
                inspire_id = get_inspire_id(
                    employee_id[0], self.mapping_inspire_ids)
            else:
                inspire_id = get_inspire_id(employee_id[0])
            if inspire_id:
                # Modify inspire_id to valid system control number
                inspire_id = "{0}{1}".format(
                    CFG_BIBAUTHORITY_AUTHOR_INSPIRE, inspire_id)
                self._create_datafield(
                    elem_record, "035__a", inspire_id, True)

        return elem_record

    def map_ldap_records(self, records):
        """Map LDAP records.

        :param list records: list of LDAP records (result-data)
        :return: list of record elements
        """
        for record in records:
            self.records.append(self.map_ldap_record(record))

        return self.records

    def update_ldap_records(self, records):
        """Map updated LDAP records.

        :param list records: list of tuples (status, record), where status is
            'add', 'remove', or 'change'
        :return: list of record XML elements
        """
        for status, record in records:
            elem_record = self.map_ldap_record(record)

            # Add datafields for removed records
            if status == "remove":
                self._create_datafield(
                    elem_record, "595__a", "REMOVED FROM SOURCE")
                self._create_datafield(
                    elem_record, "595__c", date.today().strftime("%Y-%m-%d"))

            self.records.append(elem_record)

        return self.records

    def _write_xml(self, tree, xml_file):
        """Write tree to XML file.

        :param tree: serialized XML (as returned by etree.tostring) to write
        :param filepath xml_file: XML file to write to
        :raises MapperError: if the file cannot be written
        """
        # The serialized tree is UTF-8 encoded bytes: write it in binary
        # mode (a text-mode file rejects bytes on Python 3)
        if not isinstance(tree, bytes):
            tree = tree.encode("utf-8")
        try:
            with open(xml_file, "wb") as f:
                f.write(tree)
        except EnvironmentError as e:
            raise MapperError("Error: failed writing file. ({0})".format(e))

    def write_marcxml(self, xml_file, record_size=500):
        """Prepare to write self.roots to a single file or multiple files.

        Nothing is written when there are no mapped records.

        :param int record_size: record elements in a root node [default: 500],
            if <= 0: append all records to one root node
        :param filepath xml_file: save to file,
            suffix ('_0', '_1', ...) will be added to file name
            (only if record_size > 0)
        :raises MapperError: if the directory or a file cannot be written
        """
        directory = dirname(xml_file)
        # Compare by value/truthiness, never with 'is' against a literal
        if directory and not exists(directory):
            try:
                makedirs(directory)
            except EnvironmentError as e:
                raise MapperError(
                    "Error: failed creating directory. ({0})".format(e))

        self._attach_records(record_size)
        if not self.roots:
            return

        if record_size <= 0:
            # Write single file
            targets = [(xml_file, self.roots[0])]
        else:
            # Write multiple files
            filename, ext = splitext(xml_file)
            targets = [
                ("{0}_{1}{2}".format(filename, i, ext), root)
                for i, root in enumerate(self.roots)]

        for path, root in targets:
            self._write_xml(
                etree.tostring(root, encoding='utf-8', pretty_print=True),
                path)
class UtilsError(Exception):

    """Base class for exceptions in this module."""

    pass


def bibupload(xml_file, cmd_options, name, priority="-1"):
    """Upload file to CDS using invenio.bibtask.task_low_level_submission.

    :param filepath xml_file: XML file (MARC 21 XML Schema) containing records
    :param string cmd_options: see task_low_level_submission command options
    :param string name: task specific name
    :param string priority: task priority (0=default, 1=higher, etc)
    :return: task id, None if file not found
    """
    if not isfile(xml_file):
        return None
    return task_low_level_submission(
        "bibupload",  # Name
        "bibauthority-people",  # User
        cmd_options, xml_file,
        "-P", priority,
        "-N", name)


def get_inspire_id(employee_id, mapping=None):
    """Get the Inspire-ID, given the CERN-ID (employeeID).

    If the Inspire-ID found is assigned to more than one CERN-ID, an email
    is sent to the admin and None is returned.

    :param string employee_id: CERN-ID (employeeID)
    :param dict mapping: employeeID: Inspire-ID; None (default) selects
        CFG_BIBAUTHORITY_ATLAS_GLANCE_CERN_ID_TO_INSPIRE_ID_MAPPING
    :return: Inspire-ID or None
    """
    if mapping is None:
        mapping = CFG_BIBAUTHORITY_ATLAS_GLANCE_CERN_ID_TO_INSPIRE_ID_MAPPING

    inspire_id = mapping.get(str(employee_id))
    if not inspire_id:
        return inspire_id

    # list() keeps the count working where values() returns a view
    if list(mapping.values()).count(inspire_id) > 1:
        # Duplicate inspire_id found - inform the admin
        email_subject = (
            "Duplicate Inspire-ID: {0}".format(inspire_id))
        email_content = (
            "Duplicate Inspire-ID ({0}) found when looking for "
            "CERN-ID ({1}) on ATLAS Glance."
            .format(inspire_id, employee_id))
        send_email(
            CFG_BIBAUTHORITY_ATLAS_GLANCE_EMAIL_FROM,
            CFG_BIBAUTHORITY_ATLAS_GLANCE_EMAIL_TO,
            email_subject,
            email_content)
        return None
    return inspire_id


def get_data_from_json(json_file):
    """Get data from JSON file.

    :param filepath json_file: path to JSON file containing records
    :return: python object
    :raises UtilsError: if the file cannot be opened or is not valid JSON
    """
    try:
        with open(json_file) as f:
            try:
                return load(f)
            except ValueError as e:
                raise UtilsError(
                    "Error: failed loading records from file. ({0})".format(e))
    except EnvironmentError as e:
        raise UtilsError(
            "Error: failed opening file '{0}'. ({1})".format(json_file, e))


def diff_records(records_ldap, records_local):
    """Compare records with same employeeID.

    Records are classified in three classes: changed ('change'), new
    ('add'), and removed ('remove') records.

    :param list records_ldap: fetched CERN LDAP records
    :param list records_local: previous fetched CERN LDAP records,
        saved as a JSON file
    :return: list of updated records (tuple: (status, record)), where
        status = 'change', 'add', or 'remove', or empty list
    :raises UtilsError: if a record has no usable 'employeeID' value
    """
    try:
        # Transform list of records to dictionary:
        # [{record}, ...] -> {'employeeID': {record}, ...}
        dict_ldap = dict((x.get('employeeID')[0], x) for x in records_ldap)
        dict_local = dict((x.get('employeeID')[0], x) for x in records_local)
    except (AttributeError, IndexError, KeyError, TypeError) as e:
        raise UtilsError("{0}".format(e))

    results = []

    for employee_id in dict_ldap:
        record = dict_ldap[employee_id]
        if employee_id not in dict_local:
            # New record
            results.append(('add', record))
        elif record != dict_local[employee_id]:
            # Changed record
            results.append(('change', record))

    for employee_id in dict_local:
        if employee_id not in dict_ldap:
            # Deleted record
            results.append(('remove', dict_local[employee_id]))

    return results


def version_file(src, n=10):
    """Version src file.

    Copy src with adding the timestamp as suffix, keep the latest n
    files, and remove the rest.

    :param filepath src: src file to copy
    :param int n: keep the n latest files (at least 1)
    :raises UtilsError: if copying src or removing an old version fails
    """
    from os.path import basename, join

    if not isfile(src):
        return

    n = max(n, 1)

    filename, ext = splitext(src)
    # The copies are written next to src, so that is where old versions live
    directory = dirname(filename) or "."

    # 1: Copy src
    timestamp = int(time())
    dst = "{0}_{1}{2}".format(filename, timestamp, ext)
    try:
        copyfile(src, dst)
    except EnvironmentError as e:
        raise UtilsError(
            "Error: failed copying file '{0}'. ({1})".format(src, e))

    # 2: Remove files
    # listdir() returns base names only, so the pattern must be built from
    # the base name of src: "<name>_<timestamp><ext>"
    regex = (r"^" + escape("{0}_".format(basename(filename))) +
             r"(\d+)" + escape(ext) + r"$")
    versions = []
    for entry in listdir(directory):
        found = match(regex, entry)
        if found:
            versions.append((int(found.group(1)), entry))
    # Oldest first, ordered by the numeric timestamp
    versions.sort()
    for _, entry in versions[:-n]:
        path = join(directory, entry)
        try:
            remove(path)
        except OSError as e:
            raise UtilsError(
                "Error: failed removing file '{0}'. ({1})".format(path, e))


def export_json(records, json_file):
    """Export records to file using json.dump.

    Missing parent directories of json_file are created.

    :param list records: list of records
    :param filepath json_file: path to JSON file containing records
    :raises UtilsError: if the file cannot be written or records cannot be
        serialized
    """
    directory = dirname(json_file)

    try:
        # Compare by truthiness, never with 'is' against a literal
        if directory and not exists(directory):
            makedirs(directory)
        with open(json_file, "w") as f:
            try:
                dump(records, f)
            except (TypeError, ValueError) as e:
                raise UtilsError(
                    "Error: failed dumping records to JSON. ({0})"
                    .format(e))
    except EnvironmentError as e:
        raise UtilsError(
            "Error: failed opening file. ({0})".format(e))
def update(records_updates):
    """Map updated records in records_updates and upload to CDS.

    :param list records_updates: list of tuples (status, record), where status
        is 'add', 'remove', or 'change'
    :return: True if there was nothing to upload or the upload task was
        enqueued, False if mapping, writing or enqueueing failed
    """
    write_message("{0} updated record(s) detected".format(
        len(records_updates)))
    if not records_updates:
        return True

    try:
        # Map updated records
        mapper = Mapper()
        mapper.update_ldap_records(records_updates)

        # Write updates to XML (0: all records in one single file)
        mapper.write_marcxml(CFG_BIBAUTHORITY_RECORDS_UPDATES_FILE, 0)
    except MapperError as e:
        write_message(str(e), stderr)
        return False

    # Upload updates to CDS using --replace and --insert
    task_id = bibupload(CFG_BIBAUTHORITY_RECORDS_UPDATES_FILE, "-ri",
                        "bibauthority-people-update")
    if not task_id:
        write_message("Error: failed to enqueue task", stderr)
        return False

    write_message(
        "Task (identifier: {0}) is correctly enqueued".format(task_id))
    return True


def bst_bibauthority_people_updater(
        json_file=CFG_BIBAUTHORITY_RECORDS_JSON_FILE):
    """Update CDS records with current CERN LDAP records.

    The records stored in json_file are compared with the records fetched
    from CERN LDAP; the differences are uploaded to CDS and json_file is
    then versioned and replaced by the current LDAP records.

    :param filepath json_file: path to JSON file containing records
    """
    try:
        # Fetch CERN LDAP records
        records_ldap = get_users_records_data(
            CFG_BIBAUTHORITY_LDAP_SEARCHFILTER,
            CFG_BIBAUTHORITY_LDAP_ATTRLIST,
            "utf-8")
        write_message("{0} records fetched from CERN LDAP".format(
            len(records_ldap)))

        records_local = get_data_from_json(json_file)

        # records_diff contains updated records (changed, added, or
        # removed on LDAP)
        records_diff = diff_records(records_ldap, records_local)

        if not update(records_diff):
            # Keep the previous snapshot: replacing it now would hide the
            # failed updates from the next run and they would never reach CDS
            return

        # Version existing file
        version_file(json_file)
        # Update local file with current LDAP records
        export_json(records_ldap, json_file)
    except (LDAPError, UtilsError) as e:
        write_message(str(e), stderr)