diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml
index 8ba42b45..1533bf95 100644
--- a/.github/workflows/pypi-publish.yml
+++ b/.github/workflows/pypi-publish.yml
@@ -10,16 +10,16 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: Checkout
- uses: actions/checkout@v2
+ uses: actions/checkout@v3
- name: Set up Python
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v3
with:
- python-version: 2.7
+ python-version: 3.6
- name: Install dependencies
run: |
- python -m pip install --upgrade "pip>=20,<21" "setuptools>=40,<46" wheel
+ python -m pip install --upgrade "pip==21" "setuptools==40" wheel
- name: Build package
# Remove `compile_catalog` if the package has no translations.
run: |
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ed8af9ef..d0067a05 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -24,27 +24,27 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
- #python-version: [2.7, 3.6]
- python-version: [2.7]
- requirements-level: [min, pypi]
+ python-version: [3.6]
+ #python-version: [2.7]
+ requirements-level: [pypi]
steps:
- name: Checkout
- uses: actions/checkout@v2
+ uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Generate dependencies
run: |
- python -m pip install --upgrade "pip>=20,<21" "setuptools>=40,<46" py
+ python -m pip install --upgrade "pip==21" "setuptools==40" py
python -m pip install wheel coveralls requirements-builder configparser
requirements-builder --level=${{ matrix.requirements-level }} setup.py > .${{ matrix.requirements-level }}-${{ matrix.python-version }}-requirements.txt
- name: Cache pip
- uses: actions/cache@v2
+ uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('.${{ matrix.requirements-level }}-${{ matrix.python-version }}-requirements.txt') }}
diff --git a/cds_dojson/marc21/fields/base.py b/cds_dojson/marc21/fields/base.py
index a1a9e481..52f14288 100644
--- a/cds_dojson/marc21/fields/base.py
+++ b/cds_dojson/marc21/fields/base.py
@@ -24,7 +24,7 @@
ignore_value)
from ..models.base import model
-from .utils import build_contributor, build_contributor_from_508
+from .utils import build_contributor, build_contributor_from_508, build_contributor_from_906
@model.over('recid', '^001')
@@ -54,14 +54,22 @@ def report_number(self, key, value):
return rn
-@model.over('contributors', '^(100|700|508)__')
+@model.over('contributors', '^(100|700|508|906)__')
def contributors(self, key, value):
"""Contributors."""
authors = self.get('contributors', [])
if key in ['100__', '700__']:
items = build_contributor(value)
- else:
+ elif key == '508__':
items = build_contributor_from_508(value)
+ else:
+ items = build_contributor_from_906(value)
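+ # Skip a 906 contributor that duplicates an existing name/role pair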
+ if 'contributors' in self.keys():
+ names = [dic['name'] for dic in self['contributors']]
+ roles = [dic['role'] for dic in self['contributors']]
+ if items[0]['name'] in names:
+ if items[0]['role'] == roles[names.index(items[0]['name'])]:
+ items = None
# add only contributors that are not part of the authors
if items:
authors.extend(
@@ -87,6 +95,10 @@ def translations(self, key, value):
translation = self.get('translations', [{}])[0]
if key.startswith('246'):
translation['title'] = {'title': value.get('a')}
+ if value.get('n'):
+ translation['description'] = value.get('n')
+ if value.get('p'):
+ translation['title']['subtitle'] = value.get('p')
if key.startswith('590'):
translation['description'] = value.get('a')
translation['language'] = 'fr'
@@ -94,10 +106,32 @@ def translations(self, key, value):
raise IgnoreKey('translations')
-@model.over('description', '^520__')
+@model.over('description', '(^511__)|(^5111_)|(^518__)|(^520__)')
def description(self, key, value):
"""Description."""
- return value.get('a')
+ if key == '511__' or key == '5111_':
+ if value.get('a'):
+ return 'Filmed people: ' + value.get('a')
+ elif value.get('1'):
+ return 'Filmed people: ' + value.get('1')
+ return ''
+
+ if key == '518__':
+ if value.get('a'):
+ if 'description' in self.keys():
+ return self['description'] + '\nPlace and/or date of event: ' + value.get('a')
+ return value.get('a')
+ if 'description' in self.keys():
+ return self['description']
+ return ''
+
+ if value.get('a'):
+ if 'description' in self.keys():
+ return self['description'] + '\n' + value.get('a')
+ return value.get('a')
+ if 'description' in self.keys():
+ return self['description']
+ return ''
@model.over('keywords', '^6531_')
diff --git a/cds_dojson/marc21/fields/utils.py b/cds_dojson/marc21/fields/utils.py
index 5cacc252..7a4529f6 100644
--- a/cds_dojson/marc21/fields/utils.py
+++ b/cds_dojson/marc21/fields/utils.py
@@ -79,6 +79,7 @@ def _get_correct_video_contributor_role(role):
'autor': 'Creator',
'camera': 'Camera Operator',
'camera & sound': 'Camera Operator',
+ 'chairperson': 'Chairperson',
'co-produced by': 'Co-Producer',
'co-production': 'Co-Producer',
'commentaire': 'Comments by',
@@ -119,6 +120,7 @@ def _get_correct_video_contributor_role(role):
'made by': 'Creator',
'montage': 'Editor',
'narrator': 'Narrator',
+ 'organiser': 'Organiser',
'presentator': 'Reporter',
'presented by': 'Reporter',
'presenter': 'Reporter',
@@ -146,6 +148,7 @@ def _get_correct_video_contributor_role(role):
'shooting and editing': ('Camera Operator', 'Editor'),
'son': 'Music by',
'speaker': 'Speaker',
+ 'sponsor': 'Sponsor',
'writen by': 'Screenwriter',
'writer and director': ('Screenwriter', 'Director'),
'written & directed by': ('Screenwriter', 'Director'),
@@ -227,8 +230,16 @@ def build_contributor(value):
# Avoids a few calls
value = get_author_info_from_people_collection(value)
- role = _get_correct_video_contributor_role(
- value.get('e', 'producer')) # always unicode
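+ # Prefer the role in subfield 'e'; otherwise fall back to subfield 'g', defaulting to Producer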
+ if value.get('e'):
+ role = _get_correct_video_contributor_role(
+ value.get('e', 'producer')) # always unicode
+ else:
+ try:
+ role = _get_correct_video_contributor_role(
+ value.get('g', 'producer')) # always unicode
+ except Exception:
+ role = 'Producer'
+
contributors = []
contributor = {
'ids': _extract_json_ids(value) or None,
@@ -270,3 +281,11 @@ def build_contributor_from_508(value):
return contributors
else:
return build_contributor({'a': item.strip(), 'e': 'credits'})
+
+def build_contributor_from_906(value):
+ """Build contributors from field 508."""
+ contributor = {'name': value.get('p'), 'role': 'Speaker'}
+ if value.get('u'):
+ contributor['affiliations'] = value.get('u')
+
+ return [contributor]
\ No newline at end of file
diff --git a/cds_dojson/marc21/fields/videos/video.py b/cds_dojson/marc21/fields/videos/video.py
index 714e7fdf..9180044c 100644
--- a/cds_dojson/marc21/fields/videos/video.py
+++ b/cds_dojson/marc21/fields/videos/video.py
@@ -45,6 +45,23 @@ def duration(self, key, value):
i.e. '2 min.', we will extract it programatically later to avoid the hassle
off dealing with more regex.
"""
+ data = {}
+ data['CERN_ID'] = value.get('2', '')
+ data['res_ar_fps'] = value.get('b', '')
+ data['FPS'] = value.get('c', '')
+ data['resolution'] = value.get('d', '')
+ data['aspect_ratio'] = value.get('e', '')
+
+ empty_keys = [aux_key for aux_key in data.keys() if data[aux_key] == '']
+ for aux_key in empty_keys:
+ data.pop(aux_key)
+
+ if len(data.keys()) > 0:
+ if '_digitization' not in self.keys():
+ self['_digitization'] = [data]
+ else:
+ self['_digitization'].append(data)
+
try:
return re.match(r'(\d{2}:\d{2}:\d{2})(\.\d+)?', value.get('a')) \
.group(1)
@@ -66,6 +83,21 @@ def language(self, key, value):
@model.over('physical_medium', '(^340__)|(^852__)')
def physical_medium(self, key, value):
"""Physical medium."""
+ data = {}
+ data['physical_media_note'] = value.get('h', '')
+ data['has_copy'] = value.get('j', '')
+ data['physical_media_type'] = value.get('x', '')
+
+ empty_keys = [aux_key for aux_key in data.keys() if data[aux_key] == '']
+ for aux_key in empty_keys:
+ data.pop(aux_key)
+
+ if len(data.keys()) > 0:
+ if '_digitization' not in self.keys():
+ self['_digitization'] = [data]
+ else:
+ self['_digitization'].append(data)
+
def find_match(seq, copy):
if not seq and not copy \
and key == '852__' and len(_physical_medium) == 1:
@@ -112,11 +144,123 @@ def find_match(seq, copy):
return [dict((k, v) for k, v in iteritems(i) if v is not None)
for i in _physical_medium]
+@model.over('related_links', '^775__')
+def related_links(self, key, value):
+ """Related links."""
+ related_link = {}
+ if value.get('b') and value.get('w'):
+ if value.get('c'):
+ related_link['name'] = value.get('b') + ' ' + value.get('c')
+ else:
+ related_link['name'] = value.get('b')
+
+ related_link['url'] = 'https://cds.cern.ch/record/' + value.get('w')
+ return related_link
+
+
+@model.over('_digitization', '(^336__)|(^337__)|(^5081_)|(^514__)|(^5831_)|(^583__)|(^594__)|(^597__)|(^65027)|(^690C_)|(^7870_)|(^787__)|(^856_2)|(^961__)|(^962__)|(^981__)')
+@for_each_value
+@ignore_value
+def digitization(self, key, value):
+ """Digitization field."""
+
+ data = {}
+ try:
+ if key == '336__':
+ data['curator_split_comment'] = value.get('a', '')
+ data['curator_split_time'] = value.get('b', '')
+
+ elif key == '337__':
+ data['media_type'] = value.get('a', '')
+
+ elif key == '5081_':
+ data['director_info'] = value.get('a', '')
+
+ elif key == '514__':
+ data['picturae_media_quality'] = value.get('a', '')
+
+ elif key == '5831_':
+ data['quality_control_info'] = [value.get(code) for code in ['3', '5', '6', 'a', 'b', 'c', 'f', 'i', 'k', 'l', 'n', 'o', 'u', 'x', 'z'] if value.get(code)]
+
+ elif key =='583__':
+ data['curated'] = value.get('a', '')
+ data['curation_date'] = value.get('c', '')
+ data['curation_quality_control'] = value.get('z', '')
+
+ elif key =='594__':
+ data['curator_category'] = value.get('a', '')
+
+ elif key == '597__':
+ data['internal_note'] = value.get('a', '')
+
+ elif key == '65027':
+ data['epfl_category'] = value.get('a', '')
+
+ elif key == '690C_':
+ data['collection'] = value.get('a', '')
+
+ elif key == '7870_':
+ data['related_links_info'] = [value.get(code) for code in ['i', 'r', 'w'] if value.get(code)]
+
+ elif key == '787__':
+ data['related_links_info'] = [value.get(code) for code in ['1', 'a', 'i', 'w'] if value.get(code)]
+
+ elif key == '856_2':
+ data['subtitle_extension'] = value.get('q', '')
+ data['subtitle_path'] = value.get('u', '')
+
+ if value.get('x'):
+ data['subtitle_language'] = value.get('x', '')
+ else:
+ data['subtitle_language'] = value.get('y', '')
+
+ data['subtitle_note'] = value.get('z', '')
+
+ elif key == '961__':
+ data['curator_name'] = value.get('a', '')
+ data['curator_title'] = value.get('b', '')
+ data['curation_time'] = value.get('h', '')
+
+ elif key == '962__':
+ data['conference_cds_recid'] = value.get('b', '')
+ data['conference_cds_id'] = value.get('n', '')
+
+ elif key == '981__':
+ data['deleted_cds_records'] = value.get('a', '')
+
+ except Exception:
+ # Ignore malformed or unexpected subfield values
+ pass
+
+ empty_keys = [aux_key for aux_key in data.keys() if data[aux_key] == '']
+ for aux_key in empty_keys:
+ data.pop(aux_key)
+
+ if len(data.keys()) > 0:
+ return data
+
+ return None
@model.over('_project_id', '^773__')
@ignore_value
def project_id(self, key, value):
"""Report number."""
+ data = {}
+ data['host_item_entry'] = value.get('o', '')
+ data['library_report_number'] = value.get('r', '')
+
+ empty_keys = [aux_key for aux_key in data.keys() if data[aux_key] == '']
+ for aux_key in empty_keys:
+ data.pop(aux_key)
+
+ if len(data.keys()) > 0:
+ if '_digitization' not in self.keys():
+ self['_digitization'] = [data]
+ else:
+ self['_digitization'].append(data)
+
values = force_list(value)
project_id = None
related_links = self.get('related_links', [])
@@ -135,9 +279,12 @@ def project_id(self, key, value):
return project_id
-@model.over('location', '^110__')
+@model.over('location', '(^110__)|(^901__)')
def location(self, key, value):
"""Location."""
+ if key == '901__' and 'location' not in self.keys():
+ return value.get('u')
+
return value.get('a')
@@ -153,10 +300,20 @@ def internal_note(self, key, value):
if v.get('a') in CATEGS:
_internal_categories[v.get('a')].append(v.get('s'))
else:
- _internal_notes.append(v.get('a'))
+ if v.get('a') is not None:
+ _internal_notes.append(v.get('a'))
+ else:
+ _internal_notes.append('No Category')
if _internal_categories:
self['internal_categories'] = dict(_internal_categories)
+
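+ # Record the internal note datetime (subfield 'd') in _digitization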
+ if value.get('d'):
+ if '_digitization' not in self.keys():
+ self['_digitization'] = [{'internal_note_datetime': value.get('d')}]
+ else:
+ self['_digitization'].append({'internal_note_datetime': value.get('d')})
+
return '\n'.join(_internal_notes) or None
@@ -181,25 +338,90 @@ def accelerator_experiment(self, key, value):
'project': value.get('p'),
}
-
-@model.over('date', '^269__')
+@model.over('date', '(^269__)|(^260__)')
def date(self, key, value):
"""Date."""
- return arrow.get(value.get('c')).strftime('%Y-%m-%d')
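+ # 269__ carries a full date (with a 'YYYY-MM-00' fallback for partial dates); 260__ yields only the year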
+ if value.get('c') is None:
+ return 'No Date'
+ if key == '269__':
+ try:
+ if type(value.get('c')) is tuple:
+ return arrow.get(value.get('c')[0]).strftime('%Y-%m-%d')
+ else:
+ return arrow.get(value.get('c')).strftime('%Y-%m-%d')
+
+ except Exception:
+ if type(value.get('c')) is tuple:
+ match = re.search(r'^(19|20)\d\d-(0[0-9]|1[012])-00', value.get('c')[0])
+ else:
+ match = re.search(r'^(19|20)\d\d-(0[0-9]|1[012])-00', value.get('c'))
+
+ if match is not None:
+ return match.string.replace('-00', '')
+ else:
+ return 'No Date'
+
+ else:
+ try:
+ if type(value.get('c')) is tuple:
+ return arrow.get(value.get('c')[0]).strftime('%Y')
+ else:
+ return arrow.get(value.get('c')).strftime('%Y')
+
+ except Exception:
+ return 'No Date'
-@model.over('copyright', '^542__')
+
+@model.over('copyright', '(^269__)|(^542__)|(^5421_)')
@filter_values
def copyright(self, key, value):
"""Copyright."""
- return {
- 'holder': value.get('d'),
- 'year': value.get('g'),
- 'message': value.get('f'),
- }
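+ # 269__ supplies only the holder, 5421_ notes feed _digitization, 542__ keeps the holder/year/message mapping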
+ if key == '269__':
+ if value.get('b'):
+ return {
+ 'holder': value.get('b')
+ }
+ return {'holder': ''}
+
+ if key == '5421_':
+
+ if value.get('a'):
+ if '_digitization' not in self.keys():
+ self['_digitization'] = [{'copyright': value.get('a')}]
+ else:
+ self['_digitization'].append({'copyright': value.get('a')})
+
+ if 'copyright' not in self.keys():
+ try:
+ if value.get('a'):
+ return {
+ 'holder': value.get('a'),
+ 'year': value.get('g')
+ }
+ else:
+ return {
+ 'holder': value.get('d'),
+ 'year': value.get('g')
+ }
+ except Exception:
+ return {'holder': ''}
+
+ if value.get('a'):
+ return {
+ 'holder': value.get('a'),
+ 'year': value.get('g'),
+ 'message': value.get('f'),
+ }
+ else:
+ return {
+ 'holder': value.get('d'),
+ 'year': value.get('g'),
+ 'message': value.get('f'),
+ }
-@model.over('_files', '^8567_')
+@model.over('_files', '^(8567|8564)_')
@for_each_value
@filter_values
def _files(self, key, value):
@@ -246,9 +468,14 @@ def get_tags(context_type, value):
def get_filepath(value):
if value.get('d'):
- return value.get('d')[
- len('\\\\cern.ch\\dfs\\Services\\'):
- ].replace('\\', '/')
+ if 'cern.ch\\dfs\\Services' in value.get('d'):
+ return value.get('d')[
+ len('\\\\cern.ch\\dfs\\Services\\'):
+ ].replace('\\', '/')
+
+ else:
+ return 'http://cern.ch' + value.get('d').split('www')[-1]
+
else:
return re.sub(
'https?://mediaarchive.cern.ch/', '', value.get('u', '')
@@ -263,7 +490,7 @@ def get_tags_to_guess_preset(context_type, value):
def get_tags_to_transform(context_type, value):
if context_type in ['frame', 'poster']:
- return {'timestamp': int(value.get('y').split(' ')[3])}
+ return {'timestamp': int(float(value.get('y').split(' ')[3]))}
def get_frame_name(result):
_, ext = os.path.splitext(result['key'])
@@ -296,18 +523,66 @@ def compute(value, context_type, media_type):
return result
- result = compute(deepcopy(value), *get_context_type(value))
+ if key == '8567_':
+ result = compute(deepcopy(value), *get_context_type(value))
+
+ # if it's the poster frame, make a copy for a frame!
+ if result['tags']['context_type'] == 'poster' and \
+ result['tags_to_transform']['timestamp'] == 5:
+ frame_5 = compute(value, 'frame', 'image')
+ if '_files' not in self:
+ self['_files'] = []
+ self['_files'].append(frame_5)
+ # update posterframe key name
+ _, ext = os.path.splitext(result['key'])
+ result['key'] = 'posterframe{0}'.format(ext)
+
+ else:
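+ # 8564_ entries: collect subtitle/storage metadata into _digitization and build a minimal file entry below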
+ data = {}
+ if value.get('1'):
+ data['has_subtitles'] = value.get('1', '')
+ else:
+ data['has_subtitles'] = value.get('i', '')
+ data['storage_service'] = value.get('2', '')
+ data['file_size'] = value.get('s', '')
+ data['record_control_number'] = value.get('w', '')
+ data['record_id'] = value.get('y', '')
+ data['format_resolution'] = value.get('z', '')
+
+ empty_keys = [aux_key for aux_key in data.keys() if data[aux_key] == '']
+ for aux_key in empty_keys:
+ data.pop(aux_key)
+
+ if len(data.keys()) > 0:
+ if '_digitization' not in self.keys():
+ self['_digitization'] = [data]
+ else:
+ self['_digitization'].append(data)
- # if it's the poster frame, make a copy for a frame!
- if result['tags']['context_type'] == 'poster' and \
- result['tags_to_transform']['timestamp'] == 5:
- frame_5 = compute(value, 'frame', 'image')
- if '_files' not in self:
- self['_files'] = []
- self['_files'].append(frame_5)
- # update posterframe key name
- _, ext = os.path.splitext(result['key'])
- result['key'] = 'posterframe{0}'.format(ext)
+ result = {}
+ result['key'] = get_key(value)
+
+ result['tags'] = {}
+ if value.get('u') and value.get('q') is not None:
+ result['tags']['preview'] = True
+ result['tags']['context_type'] = 'master'
+ result['tags']['content_type'] = value.get('q').lower()
+
+ else:
+ result['tags']['preview'] = False
+ result['tags']['context_type'] = value.get('q')
+
+ if value.get('y') is None:
+ result['tags']['media_type'] = value.get('y')
+
+ else:
+ try:
+ result['tags']['media_type'] = value.get('y').split('-')[0].lower()
+ except Exception:
+ result['tags']['media_type'] = None
+
+ result['filepath'] = value.get('u')
+ result['tags_to_transform'] = get_tags_to_transform(result['tags']['context_type'], value)
return result
diff --git a/cds_dojson/marc21/models/videos/video.py b/cds_dojson/marc21/models/videos/video.py
index 0b505c28..d42daeba 100644
--- a/cds_dojson/marc21/models/videos/video.py
+++ b/cds_dojson/marc21/models/videos/video.py
@@ -38,54 +38,55 @@ class CDSVideo(OverdoJSONSchema):
'035__9',
'035__a',
'100__9',
- '260__c',
- '269__b',
- '300__b',
- '300__c',
- '300__d',
- '300__e',
- '337__a',
+ #'260__c',
+ #'269__b',
+ #'300__b',
+ #'300__c',
+ #'300__d',
+ #'300__e',
+ #'337__a',
'5061_2',
'5061_5',
'5061_a',
'5061_f',
'5061_z',
'542__e',
- '690C_a',
+ #'690C_a',
'700__0',
'700__9',
- '773__o',
- '773__r',
- '787__i',
- '787__w',
- '852__j',
- '852__x',
+ #'773__o',
+ #'773__r',
+ #'787__i',
+ #'787__w',
+ #'852__j',
+ #'852__x',
# FIXME need to double check (see #85)
- '8564_8',
+ #'8564_8',
'8564_d',
- '8564_q',
- '8564_s',
+ #'8564_q',
+ #'8564_s',
'8564_u',
'8564_x',
- '8564_y',
- '8564_z',
+ #'8564_y',
+ #'8564_z',
+ #'8564_2',
'8567_2',
'916__s',
'916__w',
'937__c',
'960__a',
'961__c',
- '961__h',
+ #'961__h',
'961__l',
'961__x',
- '962__b',
+ #'962__b',
'962__l',
- '962__n',
+ #'962__n',
'962__t',
'963__a',
'980__a',
'980__b',
- '981__a',
+ #'981__a',
}
diff --git a/cds_dojson/marc21/utils.py b/cds_dojson/marc21/utils.py
index 8a5a6c79..317b5d33 100644
--- a/cds_dojson/marc21/utils.py
+++ b/cds_dojson/marc21/utils.py
@@ -21,6 +21,7 @@
from dojson.contrib.marc21.utils import MARC21_DTD, split_stream
from lxml import etree
from six import StringIO, binary_type, text_type
+import copy
from ..utils import MementoDict
@@ -54,12 +55,13 @@ def create_record(marcxml, correct=False, keep_singletons=True):
record.append(('leader', text))
controlfield_iterator = tree.iter(tag='{*}controlfield')
for controlfield in controlfield_iterator:
tag = controlfield.attrib.get('tag', '!')
text = controlfield.text or ''
if text or keep_singletons:
record.append((tag, text))
+ multi_video = set()
datafield_iterator = tree.iter(tag='{*}datafield')
for datafield in datafield_iterator:
tag = datafield.attrib.get('tag', '!')
@@ -72,6 +74,7 @@ def create_record(marcxml, correct=False, keep_singletons=True):
ind1 = ind1.replace(' ', '_')
ind2 = ind2.replace(' ', '_')
+ multi_video_with_index = False
fields = []
subfield_iterator = datafield.iter(tag='{*}subfield')
for subfield in subfield_iterator:
@@ -80,11 +83,92 @@ def create_record(marcxml, correct=False, keep_singletons=True):
if text or keep_singletons:
fields.append((code, text))
+ # Getting video indexes to create multiple records
+ if tag == '856' and code == '8':
+ multi_video_with_index = True
+ multi_video = multi_video.union({text})
+
+ # Handle an 856 field without an '8' index (non-indexed video)
+ if tag == '856' and not multi_video_with_index:
+ multi_video = multi_video.union({'not_indexed'})
+
if fields or keep_singletons:
key = '{0}{1}{2}'.format(tag, ind1, ind2)
record.append((key, MementoDict(fields)))
- return MementoDict(record)
+ # Creating multiple records
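+ # Each distinct '8' value on 856 identifies one video; shared fields are copied into every per-video record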
+ tags_indexes = {video: {} for video in multi_video}
+ tags_counter = {video: 0 for video in multi_video}
+ multi_video_dict = {video: [] for video in multi_video}
+ for tag in record:
+ # Leader/controlfield entries, or datafields without an '8' subfield
+ if type(tag[1]) is not MementoDict or '8' not in tag[1].keys():
+
+ # Propagating non-indexed information to all videos
+ if tag[0][:3] != '856':
+ for video in multi_video:
+ multi_video_dict[video].append(copy.deepcopy(tag))
+
+ if not(tag[0] in tags_indexes[video]):
+ tags_indexes[video][tag[0]] = tags_counter[video]
+
+ tags_counter[video] += 1
+
+ # Video file special case
+ else:
+ multi_video_dict['not_indexed'].append(copy.deepcopy(tag))
+
+ if not(tag[0] in tags_indexes['not_indexed']):
+ tags_indexes['not_indexed'][tag[0]] = tags_counter['not_indexed']
+
+ tags_counter['not_indexed'] += 1
+
+
+ # Tags with code '8'
+ else:
+ # The '8' value matches one of the known video indexes
+ try:
+ multi_video_dict[tag[1]['8']].append(copy.deepcopy(tag))
+
+ if not(tag[0] in tags_indexes[tag[1]['8']]):
+ tags_indexes[tag[1]['8']][tag[0]] = tags_counter[tag[1]['8']]
+ tags_counter[tag[1]['8']] += 1
+
+ # The '8' value does not match any known video index
+ except KeyError:
+ for video in multi_video:
+ multi_video_dict[video].append(copy.deepcopy(tag))
+
+ if not(tag[0] in tags_indexes[video]):
+ tags_indexes[video][tag[0]] = tags_counter[video]
+
+ tags_counter[video] += 1
+
+ # Removing redundant tags.
+ # Always use as (tag_to_be_removed, tag_to_be_maintained)
+ redundant_tags = [
+ ('260__', '269__')
+ ]
+
+ for redundant in redundant_tags:
+ for video in multi_video:
+ if tags_indexes[video].get(redundant[0]) is not None and tags_indexes[video].get(redundant[1]) is not None:
+
+ index_to_remove = tags_indexes[video][redundant[0]]
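+ # Popping at a fixed index removes consecutive occurrences of the redundant tag, since later entries shift into its place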
+ while multi_video_dict[video][index_to_remove][0] == redundant[0]:
+ multi_video_dict[video].pop(tags_indexes[video][redundant[0]])
+
+ # MARCXML with no datafield - only controlfield
+ if len(multi_video) == 0:
+ return MementoDict(record)
+
+ # A single video (indexed or not): return one record
+ if len(multi_video_dict) == 1:
+ key = next(iter(multi_video_dict))
+ return MementoDict(multi_video_dict[key])
+
+ # Multiple indexed videos
+ return [MementoDict(video_record) for video_record in multi_video_dict.values()]
def load(source):
diff --git a/cds_dojson/schemas/deposits/records/videos/video/video-v1.0.0.json b/cds_dojson/schemas/deposits/records/videos/video/video-v1.0.0.json
index fe4256e8..b9ec4d11 100644
--- a/cds_dojson/schemas/deposits/records/videos/video/video-v1.0.0.json
+++ b/cds_dojson/schemas/deposits/records/videos/video/video-v1.0.0.json
@@ -416,6 +416,144 @@
}
}
},
+ "_digitization": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "description": "Field with digitization information for old videos.",
+ "properties": {
+ "CERN_ID": {
+ "type": "string"
+ },
+ "res_ar_fps": {
+ "type": "string"
+ },
+ "FPS": {
+ "type": "string"
+ },
+ "resolution": {
+ "type": "string"
+ },
+ "aspect_ratio": {
+ "type": "string"
+ },
+ "curated": {
+ "type": "string"
+ },
+ "curator_name": {
+ "type": "string"
+ },
+ "curator_title": {
+ "type": "string"
+ },
+ "curation_date": {
+ "type": "string"
+ },
+ "curation_time": {
+ "type": "string"
+ },
+ "curation_quality_control": {
+ "type": "string"
+ },
+ "curator_category": {
+ "type": "string"
+ },
+ "curator_split_comment": {
+ "type": "string"
+ },
+ "curator_split_time": {
+ "type": "string"
+ },
+ "media_type": {
+ "type": "string"
+ },
+ "director_info": {
+ "type": "string"
+ },
+ "picturae_media_quality": {
+ "type": "string"
+ },
+ "copyright": {
+ "type": "string"
+ },
+ "quality_control_info": {
+ "items": {
+ "type": "object"
+ },
+ "type": "array"
+ },
+ "internal_note": {
+ "type": "string"
+ },
+ "internal_note_datetime": {
+ "type": "string"
+ },
+ "epfl_category": {
+ "type": "string"
+ },
+ "collection": {
+ "type": "string"
+ },
+ "host_item_entry": {
+ "type": "string"
+ },
+ "library_report_number": {
+ "type": "string"
+ },
+ "related_links_info": {
+ "items": {
+ "type": "object"
+ },
+ "type": "array"
+ },
+ "physical_media_type": {
+ "type": "string"
+ },
+ "has_copy": {
+ "type": "string"
+ },
+ "has_subtitles": {
+ "type": "string"
+ },
+ "storage_service": {
+ "type": "string"
+ },
+ "file_size": {
+ "type": "string"
+ },
+ "record_control_number": {
+ "type": "string"
+ },
+ "record_id": {
+ "type": "string"
+ },
+ "format_resolution": {
+ "type": "string"
+ },
+ "subtitle_extension": {
+ "type": "string"
+ },
+ "subtitle_path": {
+ "type": "string"
+ },
+ "subtitle_language": {
+ "type": "string"
+ },
+ "subtitle_note": {
+ "type": "string"
+ },
+ "conference_cds_recid": {
+ "type": "string"
+ },
+ "conference_cds_id": {
+ "type": "string"
+ },
+ "deleted_cds_records": {
+ "type": "string"
+ }
+ }
+ }
+ },
"translations": {
"items": {
"type": "object",
diff --git a/cds_dojson/schemas/records/videos/video/video-v1.0.0.json b/cds_dojson/schemas/records/videos/video/video-v1.0.0.json
index 647db46a..9eb170b5 100644
--- a/cds_dojson/schemas/records/videos/video/video-v1.0.0.json
+++ b/cds_dojson/schemas/records/videos/video/video-v1.0.0.json
@@ -151,6 +151,144 @@
}
}
},
+ "_digitization": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "description": "Field with digitization information for old videos.",
+ "properties": {
+ "CERN_ID": {
+ "type": "string"
+ },
+ "res_ar_fps": {
+ "type": "string"
+ },
+ "FPS": {
+ "type": "string"
+ },
+ "resolution": {
+ "type": "string"
+ },
+ "aspect_ratio": {
+ "type": "string"
+ },
+ "curated": {
+ "type": "string"
+ },
+ "curator_name": {
+ "type": "string"
+ },
+ "curator_title": {
+ "type": "string"
+ },
+ "curation_date": {
+ "type": "string"
+ },
+ "curation_time": {
+ "type": "string"
+ },
+ "curation_quality_control": {
+ "type": "string"
+ },
+ "curator_category": {
+ "type": "string"
+ },
+ "curator_split_comment": {
+ "type": "string"
+ },
+ "curator_split_time": {
+ "type": "string"
+ },
+ "media_type": {
+ "type": "string"
+ },
+ "director_info": {
+ "type": "string"
+ },
+ "picturae_media_quality": {
+ "type": "string"
+ },
+ "copyright": {
+ "type": "string"
+ },
+ "quality_control_info": {
+ "items": {
+ "type": "object"
+ },
+ "type": "array"
+ },
+ "internal_note": {
+ "type": "string"
+ },
+ "internal_note_datetime": {
+ "type": "string"
+ },
+ "epfl_category": {
+ "type": "string"
+ },
+ "collection": {
+ "type": "string"
+ },
+ "host_item_entry": {
+ "type": "string"
+ },
+ "library_report_number": {
+ "type": "string"
+ },
+ "related_links_info": {
+ "items": {
+ "type": "object"
+ },
+ "type": "array"
+ },
+ "physical_media_type": {
+ "type": "string"
+ },
+ "has_copy": {
+ "type": "string"
+ },
+ "has_subtitles": {
+ "type": "string"
+ },
+ "storage_service": {
+ "type": "string"
+ },
+ "file_size": {
+ "type": "string"
+ },
+ "record_control_number": {
+ "type": "string"
+ },
+ "record_id": {
+ "type": "string"
+ },
+ "format_resolution": {
+ "type": "string"
+ },
+ "subtitle_extension": {
+ "type": "string"
+ },
+ "subtitle_path": {
+ "type": "string"
+ },
+ "subtitle_language": {
+ "type": "string"
+ },
+ "subtitle_note": {
+ "type": "string"
+ },
+ "conference_cds_recid": {
+ "type": "string"
+ },
+ "conference_cds_id": {
+ "type": "string"
+ },
+ "deleted_cds_records": {
+ "type": "string"
+ }
+ }
+ }
+ },
"keywords": {
"items": {
"type": "object"
diff --git a/cds_dojson/schemas/records/videos/video/video_src-v1.0.0.json b/cds_dojson/schemas/records/videos/video/video_src-v1.0.0.json
index 8cbf3036..8c86cbe7 100644
--- a/cds_dojson/schemas/records/videos/video/video_src-v1.0.0.json
+++ b/cds_dojson/schemas/records/videos/video/video_src-v1.0.0.json
@@ -111,6 +111,144 @@
"_project_id": {
"type": "string"
},
+ "_digitization": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "description": "Field with digitization information for old videos.",
+ "properties": {
+ "CERN_ID": {
+ "type": "string"
+ },
+ "res_ar_fps": {
+ "type": "string"
+ },
+ "FPS": {
+ "type": "string"
+ },
+ "resolution": {
+ "type": "string"
+ },
+ "aspect_ratio": {
+ "type": "string"
+ },
+ "curated": {
+ "type": "string"
+ },
+ "curator_name": {
+ "type": "string"
+ },
+ "curator_title": {
+ "type": "string"
+ },
+ "curation_date": {
+ "type": "string"
+ },
+ "curation_time": {
+ "type": "string"
+ },
+ "curation_quality_control": {
+ "type": "string"
+ },
+ "curator_category": {
+ "type": "string"
+ },
+ "curator_split_comment": {
+ "type": "string"
+ },
+ "curator_split_time": {
+ "type": "string"
+ },
+ "media_type": {
+ "type": "string"
+ },
+ "director_info": {
+ "type": "string"
+ },
+ "picturae_media_quality": {
+ "type": "string"
+ },
+ "copyright": {
+ "type": "string"
+ },
+ "quality_control_info": {
+ "items": {
+ "type": "object"
+ },
+ "type": "array"
+ },
+ "internal_note": {
+ "type": "string"
+ },
+ "internal_note_datetime": {
+ "type": "string"
+ },
+ "epfl_category": {
+ "type": "string"
+ },
+ "collection": {
+ "type": "string"
+ },
+ "host_item_entry": {
+ "type": "string"
+ },
+ "library_report_number": {
+ "type": "string"
+ },
+ "related_links_info": {
+ "items": {
+ "type": "object"
+ },
+ "type": "array"
+ },
+ "physical_media_type": {
+ "type": "string"
+ },
+ "has_copy": {
+ "type": "string"
+ },
+ "has_subtitles": {
+ "type": "string"
+ },
+ "storage_service": {
+ "type": "string"
+ },
+ "file_size": {
+ "type": "string"
+ },
+ "record_control_number": {
+ "type": "string"
+ },
+ "record_id": {
+ "type": "string"
+ },
+ "format_resolution": {
+ "type": "string"
+ },
+ "subtitle_extension": {
+ "type": "string"
+ },
+ "subtitle_path": {
+ "type": "string"
+ },
+ "subtitle_language": {
+ "type": "string"
+ },
+ "subtitle_note": {
+ "type": "string"
+ },
+ "conference_cds_recid": {
+ "type": "string"
+ },
+ "conference_cds_id": {
+ "type": "string"
+ },
+ "deleted_cds_records": {
+ "type": "string"
+ }
+ }
+ }
+ },
"_cds": {
"type": "object",
"properties": {
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 43aacd93..c8c5fab6 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -57,4 +57,6 @@ def test_cli(src, compiled):
pkg_resources.resource_filename('cds_dojson.schemas',
compiled), 'r') as f:
compile_schema_expected = json.load(f)
assert compile_schema_expected == compiled_schema_result
diff --git a/tests/test_videos_project.py b/tests/test_videos_project.py
index f2f3f4e9..b10ccdfe 100644
--- a/tests/test_videos_project.py
+++ b/tests/test_videos_project.py
@@ -18,11 +18,11 @@
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Video rules tests."""
import mock
+from helpers import load_fixture_file, mock_contributor_fetch, validate
from cds_dojson.marc21.fields.videos.utils import language_to_isocode
from cds_dojson.marc21.models.videos.project import model
from cds_dojson.marc21.utils import create_record
-from helpers import load_fixture_file, mock_contributor_fetch, validate
def test_required_fields(app):
diff --git a/tests/test_videos_video.py b/tests/test_videos_video.py
index d1bdfd45..8bafda03 100644
--- a/tests/test_videos_video.py
+++ b/tests/test_videos_video.py
@@ -18,11 +18,11 @@
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Video rules tests."""
import mock
+from helpers import load_fixture_file, mock_contributor_fetch, validate
from cds_dojson.marc21.fields.videos.utils import language_to_isocode
from cds_dojson.marc21.models.videos.video import model
from cds_dojson.marc21.utils import create_record
-from helpers import load_fixture_file, mock_contributor_fetch, validate
def test_required_fields(app):
@@ -47,7 +47,7 @@ def test_required_fields(app):
'test-email@cern.ch',
'example@test.com'],
'update': ['another.user@cern.ch',
- 'tuser@cern.ch']},
+ 'tuser@cern.ch']},
'_files': [
{
'filepath': 'MediaArchive/Video/Masters/Movies/CERN/2017/CERN-MOVIE-2017-023/Final_Output/CERN-MOVIE-2017-023-001.mov',
@@ -202,6 +202,21 @@ def test_required_fields(app):
'tags_to_transform': {'timestamp': 95}
}
],
+ '_digitization': [
+ {
+ 'res_ar_fps': '1920x1080 16/9, 25.00',
+ 'FPS': '25',
+ 'resolution': '1920x1080',
+ 'aspect_ratio': '16:9'
+ },
+ {
+ 'collection': 'publvideomovie'
+ },
+ {
+ 'host_item_entry': 'AVW.project.2963',
+ 'library_report_number': 'CERN-MOVIE-2017-023'
+ }
+ ],
'_project_id': 'https://cds.cern.ch/record/1',
'category': 'CERN',
'contributors': [
@@ -461,6 +476,10 @@ def check_transformation(marcxml_body, json_body):
'related_links': [
{'name': 'Version anglaise', 'url': 'http://cds.cern.ch/record/43172'},
{'name': 'Version allemande', 'url': 'https://cds.cern.ch/record/2194933'},
+ ],
+ '_digitization': [
+ {'library_report_number': 'CERN-FILM-1965-44'},
+ {'host_item_entry': 'AVW.project.111', 'library_report_number': 'CERN-MOVIE-1965-001'}
]}
)
check_transformation(
@@ -519,7 +538,10 @@ def check_transformation(marcxml_body, json_body):
16:9
""", {
- 'duration': '00:00:00'
+ 'duration': '00:00:00',
+ '_digitization': [
+ {'aspect_ratio': '16:9'}
+ ]
})
check_transformation(
"""
@@ -528,7 +550,10 @@ def check_transformation(marcxml_body, json_body):
16:9
""", {
- 'duration': '12:33:12'
+ 'duration': '12:33:12',
+ '_digitization': [
+ {'aspect_ratio': '16:9'}
+ ]
})
check_transformation(
"""
@@ -537,7 +562,10 @@ def check_transformation(marcxml_body, json_body):
16:9
""", {
- 'duration': '00:00:00'
+ 'duration': '00:00:00',
+ '_digitization': [
+ {'res_ar_fps': '16:9,', 'aspect_ratio': '16:9'}
+ ]
})
check_transformation(
"""