python_contentdm_pitcher/pitcher.py at main · kevinworthington/python_contentdm_pitcher · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/python3
# The following script is used to updated existing item records in CONTENTdm through automation using the SOAP web service known as Catcher.


# To use: update the settings.py file with your username, password, contentdm licence and the base url for you CONTENTdm server
#   Then add rows to the CatcherUpload.csv for each item update, see below for a description of the csv file.
#   Call the pitcher.py file using 'python3 pitcher.py' from the terminal

# Once complete a log of the transations will be placed in a folder called 'Completed'
# A limitation of this program is that once a record edit is made, it becomes locked, preventing additional edits to the same record
# Indexing of items allows the item change to take effect which unlocks the edited record
#   Navigating to items>index>and clicking "index now" will trigger reindexing


# This script takes a csv file of updates with four fields:
#   Alias: The CONTENTdm collection internal name for the item to be updated. This can be found in the URL for the item.
#   CDM_id: The item's internal id. Can be pulled from an export of the collection.
#   CDM_field: The metadata attribute/field name to be changed. Note these are truncated to 5-6 lowercase characters.
#       Can be verified from the CONTENTdm Administration interface under:
#       'Collection administration'>'Field Properties'>'edit'(next to the field to be updated)
#        Then on the 'edit field' page, take the string in the browser address after the '='
#            e.g https://....org/cgi-bin/admin/editfield.exe?CISODB=/p17393coll164&CISONICK=latitu
#            Should be 'latitu'
#   Value: the new value to be added
# Note: one value per row
# Script set to iterate through the current directory and process any csv files beginning with "CatcherUpload"
# Transaction files are output to "Completed" directory with timestamp.

# Adapted from https://gist.github.com/saverkamp/9198310
# Original Author: Shawn Averkamp, 2014-02-24


# Updated to Python 3 by: Kevin Worthington, Colorado State University

# OCLC Catcher Documentation:
# https://help.oclc.org/@api/deki/pages/12875/pdf/Guide%2bto%2bthe%2bCONTENTdm%2bCatcher.pdf?stylesheet=default
# https://help.oclc.org/Metadata_Services/CONTENTdm/CONTENTdm_Catcher/Download_the_CONTENTdm_Catcher?sl=en
# https://help.oclc.org/Metadata_Services/CONTENTdm/CONTENTdm_Catcher/Guide_to_the_CONTENTdm_Catcher


# Last modified: 2022-08-26

import urllib
import urllib.request
from suds.client import Client
from suds.transport.https import HttpAuthenticated
from suds.transport import TransportError
import datetime
import csv
import os
import fnmatch
import socket

# set variables for SOAP connection--requires config.ini file

from settings import *

port = '8888'# was '81' you should not need to update
url = base + ":" + port
WSDL='https://www.oclc.org/content/dam/community/CONTENTdm/downloads-addons/CatcherService.xml'
#WSDL='https://worldcat.org/webservices/contentdm/catcher/6.0/CatcherService.wsdl'

# https://stackoverflow.com/questions/25083855/403-when-retrieving-a-wsdl-via-python-suds
class HttpHeaderModify(HttpAuthenticated):
    def open(self, request):
        try:
            url = request.url
            u2request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla'})
            self.proxy = self.options.proxy
            return self.u2open(u2request)
        except urllib.error.HTTPError as e:
            raise TransportError(str(e), e.code, e.fp)
        except socket.timeout as err:
            print(err)


class Catcher(object):
    """A CONTENTdm Catcher session."""
    def __init__(self, url=url, user=user, password=password, license=license):
        print("init...")
        self.transactions = []
        self.client = Client(WSDL, transport=HttpHeaderModify(), timeout=100)
        self.client.set_options(headers={'User-Agent': 'Mozilla'})
        print(self.client)
        self.url = url
        self.user = user
        self.password = password
        self.license = license

    def processCONTENTdm(self, action, user, password, license, alias, metadata):
    # function to connect to CatcherServices and process metadata updates
        transaction = self.client.service.processCONTENTdm(action, url, user, password, license, alias, metadata)
        self.transactions.append(transaction)

    def edit(self, alias, recordid, field, value):
    #function to edit metadata--call packageMetadata and processCONTENTdm
        metadata = self.packageMetadata('edit', recordid, field, value)
        self.processCONTENTdm('edit', self.user, self.password, self.license, alias, metadata)

    def packageMetadata(self, action, recordid, field, value):
    #function to package metadata in metadata wrapper
        action = action
        if action == 'edit':
            metadata = self.client.factory.create('metadataWrapper')
            metadata.metadataList = self.client.factory.create('metadataWrapper.metadataList')
            metadata1 = self.client.factory.create('metadata')
            metadata1.field = 'dmrecord'
            metadata1.value = recordid
            metadata2 = self.client.factory.create('metadata')
            metadata2.field = field
            metadata2.value = value
            metadata.metadataList.metadata = [metadata1, metadata2]
        return metadata

def UnicodeDictReader(str_data, encoding, **kwargs):
    return csv.DictReader(str_data, **kwargs)
    # # Decode the keys once
    # keymap = dict((k, k.decode(encoding)) for k in csv_reader.fieldnames)
    # for row in csv_reader:
    #     yield dict((keymap[k], v.decode(encoding)) for k, v in row.iteritems())

#iterate through current directory to find any files starting with 'CatcherUpload' and process
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, 'CatcherUpload*.csv'):
        csvfile = open(file, 'r')
        c = UnicodeDictReader(csvfile, encoding='utf-8')

        # create directory for completed files and transaction logs if it doesn't exist
        newdir = 'Completed'
        if not os.path.isdir(newdir):
            os.mkdir(newdir)

        # create csv file for transactions
        today = datetime.datetime.now().strftime('%Y-%m-%d--%H-%M')
        fname = 'Completed/Transactions_' + file[0:-4] + '_' + today + '.csv'
        f = open(fname, 'w')
        fwtr = csv.writer(f)
        header = ['Alias', 'CDM_id', 'CDM_field', 'Transaction']
        fwtr.writerow(header)

        #initialize Catcher session
        session = Catcher({})

        #iterate through rows in csv and process through Catcher
        for row in c:
            # get values from row and package metadata for edits
            cdmid = row['CDM_page_id']
            alias = row['Alias']
            field = row['CDM_field']
            value = row['Value']
            # package metadata for Catcher and upload if value is not empty
            if value != '':
                print("Attempting",alias, cdmid, field, value)
                session.edit(alias, cdmid, field, value)
                # write transaction message to file
                fRow = [alias, cdmid, field, session.transactions[-1]]
            else:
                fRow = [alias, cdmid, field, 'No content--not uploaded']
            fwtr.writerow(fRow)

        f.close()
        csvfile.close()
        # append timestamp to upload csv filename and move to Completed directory
        # newcsv = 'Completed/' + file[0:-4] + '_' + today + '.csv'
        # oldfile = file
        # os.rename(oldfile, newcsv)