diff --git a/harvestingkit/contrast_out.py b/harvestingkit/contrast_out.py index ec47dd4..3752244 100644 --- a/harvestingkit/contrast_out.py +++ b/harvestingkit/contrast_out.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ## ## This file is part of Harvesting Kit. -## Copyright (C) 2013, 2014 CERN. +## Copyright (C) 2013, 2014, 2016 CERN. ## ## Harvesting Kit is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as @@ -99,7 +99,9 @@ def connect(self): try: self.ftp = FtpHandler(self.config.ELSEVIER.URL, self.config.ELSEVIER.LOGIN, - self.config.ELSEVIER.PASSWORD) + self.config.ELSEVIER.PASSWORD, + port = int(self.config.ELSEVIER.PORT), + sftp = self.config.ELSEVIER.SFTP) self.logger.debug(('Successful connection to the ' 'Elsevier server')) return diff --git a/harvestingkit/elsevier_package.py b/harvestingkit/elsevier_package.py index 638f3d5..5a85e54 100644 --- a/harvestingkit/elsevier_package.py +++ b/harvestingkit/elsevier_package.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Harvesting Kit. -# Copyright (C) 2013, 2014, 2015 CERN. +# Copyright (C) 2013, 2014, 2015, 2016 CERN. 
def get_keywords(self, xml_doc):
    """ Collects the keywords of an article.

    The ``ce:keyword`` elements are looked up inside the first
    ``ja:head`` element, falling back to the first ``cja:head`` element
    and finally to the whole document when neither is present.

    :param xml_doc: parsed document exposing ``getElementsByTagName``.

    :returns: list with the non-empty ``ce:text`` value of each keyword.
    """
    head = xml_doc.getElementsByTagName("ja:head")
    if not head:
        head = xml_doc.getElementsByTagName("cja:head")
    scope = head[0] if head else xml_doc

    keywords = []
    for keyword in scope.getElementsByTagName("ce:keyword"):
        # Extract the text once and reuse it for both the emptiness
        # check and the result.
        text = get_value_in_tag(keyword, "ce:text")
        if text:
            keywords.append(text)
    return keywords
def __init__(self, server, username='', passwd='', netrc_file='',
             port=21, sftp=False):
    """ Opens a session on the server and records the start directory.

    :param server: address of the server, with or without a scheme.
    :type server: string
    :param username: login name.
    :type username: string
    :param passwd: password.
    :type passwd: string
    :param netrc_file: path of a netrc file; when given, its entry for
                       the server overrides username and passwd.
    :type netrc_file: string
    :param port: port to connect to.
    :type port: int
    :param sftp: use SFTP (paramiko) instead of plain FTP.
    :type sftp: bool
    """
    self.port = port
    self.sftp = sftp
    parsed = urlparse(server)
    # 'ftp://host' ends up in netloc, a bare 'host' ends up in path.
    if parsed.netloc:
        server = parsed.netloc
    elif parsed.path:
        server = parsed.path
    self._server = server
    if self.sftp:
        self._ftp = paramiko.Transport((server, self.port))
    else:
        # Create the client unconnected: FTP(server) would connect
        # right away on the default port and ignore ``port``.
        self._ftp = FTP()
    self._username = username
    self._passwd = passwd
    if netrc_file:
        logininfo = netrc(netrc_file).authenticators(server)
        self._username, _, self._passwd = logininfo
    self.connect()
    if self.sftp:
        self._home = self._sftp_client.getcwd()
    else:
        self._home = self._ftp.pwd()


def connect(self):
    """ Connects and logins to the server. """
    if self.sftp:
        self._ftp.connect(username=self._username, password=self._passwd)
        self._sftp_client = paramiko.SFTPClient.from_transport(self._ftp)
        # paramiko only emulates a working directory after the first
        # chdir; until then getcwd() returns None, which breaks every
        # "remember the folder and go back" sequence of this class.
        self._sftp_client.chdir('.')
    else:
        self._ftp.connect(host=self._server, port=self.port)
        self._ftp.login(user=self._username, passwd=self._passwd)
def cd(self, folder):
    """ Changes the working directory on the server.

    An absolute path is entered in one step; a relative path is entered
    one component at a time, skipping empty components.

    :param folder: the desired directory.
    :type folder: string

    :raises error_perm: when the server refuses the change. SFTP
                        failures (IOError) are re-raised as error_perm
                        so that callers handle both protocols alike.
    """
    if folder.startswith('/'):
        steps = [folder]
    else:
        steps = [part for part in folder.split('/') if part]
    for step in steps:
        if self.sftp:
            try:
                self._sftp_client.chdir(step)
            except IOError as err:
                raise error_perm(str(err))
        else:
            self._ftp.cwd(step)


def ls(self, folder=''):
    """ Lists the files and folders of a specific directory
    (default is the current working directory).

    :param folder: the folder to be listed.
    :type folder: string

    :returns: a tuple with the list of files in the folder
              and the list of subfolders in the folder.
    """
    if self.sftp:
        import stat
        files, folders = [], []
        # One request returns name and mode of every entry, so neither
        # a directory change nor a per-entry lstat is needed.
        for entry in self._sftp_client.listdir_attr(folder or '.'):
            mode = entry.st_mode or 0
            if stat.S_ISREG(mode):
                files.append(entry.filename)
            elif stat.S_ISDIR(mode):
                folders.append(entry.filename)
        return files, folders

    current_folder = self._ftp.pwd()
    contents = []
    try:
        self.cd(folder)
        self._ftp.retrlines('LIST', contents.append)
    finally:
        # Go back even when the listing fails.
        self._ftp.cwd(current_folder)

    files, folders = [], []
    for line in contents:
        fields = line.split()
        if not fields:
            continue
        # The name starts at the ninth column and may contain spaces.
        name = ' '.join(fields[8:])
        if fields[0].startswith('-'):
            files.append(name)
        elif fields[0].startswith('d'):
            folders.append(name)
    return files, folders
:type folder: string """ - current_folder = self._ftp.pwd() + + if self.sftp: + current_folder = self._sftp_client.getcwd() + else: + current_folder = self._ftp.pwd() #creates the necessary folders on #the server if they don't exist folders = folder.split('/') @@ -200,7 +252,10 @@ def mkdir(self, folder): try: self.cd(fld) except error_perm: # folder does not exist - self._ftp.mkd(fld) + if self.sftp: + self._sftp_client.mkdir(fld) + else: + self._ftp.mkd(fld) self.cd(fld) self.cd(current_folder) @@ -211,11 +266,17 @@ def rm(self, filename): :type filename: string """ try: - self._ftp.delete(filename) + if self.sftp: + self._sftp_client.remove(filename) + else: + self._ftp.delete(filename) except error_perm: # target is either a directory # either it does not exist try: - current_folder = self._ftp.pwd() + if self.sftp: + current_folder = self._sftp_client.getcwd() + else: + current_folder = self._ftp.pwd() self.cd(filename) except error_perm: print('550 Delete operation failed %s ' @@ -232,7 +293,11 @@ def rmdir(self, foldername): :param foldername: the folder to be deleted. 
def get_filesize(self, filename):
    """ Returns the filesize of a file

    :param filename: the path of the file on the server.
    :type filename: string

    :returns: string representation of the filesize.
    """
    if self.sftp:
        # Return a string here as well, so both protocols agree.
        return str(self._sftp_client.lstat(filename).st_size)
    listing = []
    self._ftp.dir(filename, listing.append)
    # The size is the fifth column of the listing line.
    return listing[0].split()[4]


def get_datestamp(self, filename):
    """ Returns the modification date of a file.

    :param filename: the path of the file on the server.
    :type filename: string

    :returns: the date formatted as YYYY-MM-DD.
    """
    if self.sftp:
        mtime = self._sftp_client.lstat(filename).st_mtime
        return datetime.fromtimestamp(mtime).strftime("%Y-%m-%d")
    reply = self._ftp.sendcmd('MDTM ' + filename)
    # Skip the 4-character reply-code prefix and keep only the 14 digits
    # of YYYYMMDDHHMMSS; an optional fraction of a second may follow.
    stamp = reply[4:].strip()[:14]
    return datetime.strptime(stamp, "%Y%m%d%H%M%S").strftime("%Y-%m-%d")
def upload(self, filename, location=''):
    """ Uploads a file on the server to the desired location

    The target folder is created when missing and the previous working
    directory is restored afterwards, also when the transfer fails.

    :param filename: the name of the file to be uploaded.
    :type filename: string
    :param location: the directory in which the file will
                     be stored.
    :type location: string
    """
    if self.sftp:
        current_folder = self._sftp_client.getcwd()
    else:
        current_folder = self._ftp.pwd()
    # The file keeps its own name on the server, without the local path.
    remote_name = filename.split('/')[-1]
    try:
        self.mkdir(location)
        self.cd(location)
        if self.sftp:
            # put() needs the destination file name, not the folder;
            # the folder has already been entered above.
            self._sftp_client.put(filename, remote_name)
        else:
            with open(filename, 'rb') as fl:
                self._ftp.storbinary('STOR %s' % remote_name, fl)
    finally:
        # Use the protocol clients directly: the saved SFTP folder may
        # be None, which cd() cannot handle.
        if self.sftp:
            self._sftp_client.chdir(current_folder)
        else:
            self._ftp.cwd(current_folder)