diff --git a/ace/__init__.py b/ace/__init__.py
index 948311f..582c968 100644
--- a/ace/__init__.py
+++ b/ace/__init__.py
@@ -1,8 +1,11 @@
-# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
-# ex: set sts=4 ts=4 sw=4 et:
+# emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
+# ex: set sts=4 sw=4 et:
 """ACE -- Automated Coordinate Extraction. """
-__all__ = ["config", "ingest", "database", "datatable", "set_logging_level", "scrape", "sources", "tableparser", "tests", "__version__"]
+__all__ = [
+    "config", "ingest", "database", "datatable", "set_logging_level",
+    "scrape", "sources", "tableparser", "tests", "__version__"
+]
 
 import logging
 import sys
@@ -10,24 +13,31 @@
 from .version import __version__
 
 
 def set_logging_level(level=None):
     """Set package-wide logging level
 
-    Args
-        level : Logging level constant from logging module (warning, error, info, etc.)
+    Args:
+        level: Logging level constant from logging module
+            (warning, error, info, etc.)
     """
     if level is None:
         level = os.environ.get('ACE_LOGLEVEL', 'warn')
     logger.setLevel(getattr(logging, level.upper()))
     return logger.getEffectiveLevel()
 
 
 def _setup_logger(logger):
     # Basic logging setup
     console = logging.StreamHandler(sys.stdout)
-    console.setFormatter(logging.Formatter("%(levelname)-6s %(module)-7s %(message)s"))
+    formatter = logging.Formatter(
+        "%(levelname)-6s %(module)-7s %(message)s"
+    )
+    console.setFormatter(formatter)
     logger.addHandler(console)
     set_logging_level()
 
+
 # Set up logger
 logger = logging.getLogger("ace")
 _setup_logger(logger)
\ No newline at end of file
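A minimal usage sketch of the logging entry point touched above (assuming the package is importable as `ace`; the `ACE_LOGLEVEL` value is illustrative only):

    import os
    import ace

    # An explicit argument wins over the environment
    ace.set_logging_level('debug')

    # With no argument, the level falls back to the ACE_LOGLEVEL
    # environment variable, defaulting to 'warn'
    os.environ['ACE_LOGLEVEL'] = 'info'
    ace.set_logging_level()  # returns the effective level as an int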
diff --git a/ace/config.py b/ace/config.py
index e5dd22b..cf41d02 100644
--- a/ace/config.py
+++ b/ace/config.py
@@ -62,33 +62,129 @@
 SAVE_ORIGINAL_HTML = False
-
-
 ''' SCRAPING/PARSING SETTINGS '''
 USER_AGENTS = [
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',
-    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',
-    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',
-    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',
-    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',
-    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',
-    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',
-    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.60 Safari/537.36'
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',  # noqa: E501
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.60 Safari/537.36'  # noqa: E501
 ]
+
+
+class ConfigManager:
+    """Manages runtime configuration settings for ACE"""
+
+    _defaults = {}
+    _overrides = {}
+
+    def __init__(self):
+        # Capture initial defaults
+        self._defaults = {
+            'SILENT_ERRORS': SILENT_ERRORS,
+            'SQL_ADAPTER': SQL_ADAPTER,
+            'SQLITE_URI': SQLITE_URI,
+            'MYSQL_USER': MYSQL_USER,
+            'MYSQL_PASSWORD': MYSQL_PASSWORD,
+            'MYSQL_DB': MYSQL_DB,
+            'SAVE_ARTICLES_WITHOUT_ACTIVATIONS':
+                SAVE_ARTICLES_WITHOUT_ACTIVATIONS,
+            'OVERWRITE_EXISTING_ROWS': OVERWRITE_EXISTING_ROWS,
+            'CAREFUL_PARSING': CAREFUL_PARSING,
+            'IGNORE_BAD_ROWS': IGNORE_BAD_ROWS,
+            'EXCLUDE_TABLES_WITH_MISSING_LABELS':
+                EXCLUDE_TABLES_WITH_MISSING_LABELS,
+            'USE_READABILITY': USE_READABILITY,
+            'SAVE_ORIGINAL_HTML': SAVE_ORIGINAL_HTML
+        }
+
+    def update(self, **kwargs):
+        """Update configuration settings at runtime
+
+        Args:
+            **kwargs: Key-value pairs of configuration settings to update
+        """
+        valid_keys = self._defaults.keys()
+        for key, value in kwargs.items():
+            if key in valid_keys:
+                self._overrides[key] = value
+            else:
+                raise ValueError(f"Invalid config key: {key}")
+
+    def reset(self, key=None):
+        """Reset configuration to default values
+
+        Args:
+            key (str): Specific key to reset (reset all if None)
+        """
+        if key:
+            if key in self._overrides:
+                del self._overrides[key]
+        else:
+            self._overrides = {}
+
+    def __getattr__(self, name):
+        if name in self._overrides:
+            return self._overrides[name]
+        elif name in self._defaults:
+            return self._defaults[name]
+        else:
+            raise AttributeError(f"Config setting {name} does not exist")
+
+
+# Create global config manager instance
+config_manager = ConfigManager()
+
+
+# Proxy functions for easier access
+def update_config(**kwargs):
+    """Update configuration settings at runtime"""
+    config_manager.update(**kwargs)
+
+
+def reset_config(key=None):
+    """Reset configuration to default values"""
+    config_manager.reset(key)
+
+
+def get_config(key):
+    """Get current configuration value"""
+    return getattr(config_manager, key)
+
+
+# Expose config settings through the manager
+SILENT_ERRORS = config_manager.SILENT_ERRORS
+SQL_ADAPTER = config_manager.SQL_ADAPTER
+SQLITE_URI = config_manager.SQLITE_URI
+MYSQL_USER = config_manager.MYSQL_USER
+MYSQL_PASSWORD = config_manager.MYSQL_PASSWORD
+MYSQL_DB = config_manager.MYSQL_DB
+SAVE_ARTICLES_WITHOUT_ACTIVATIONS = \
+    config_manager.SAVE_ARTICLES_WITHOUT_ACTIVATIONS
+OVERWRITE_EXISTING_ROWS = config_manager.OVERWRITE_EXISTING_ROWS
+CAREFUL_PARSING = config_manager.CAREFUL_PARSING
+IGNORE_BAD_ROWS = config_manager.IGNORE_BAD_ROWS
+EXCLUDE_TABLES_WITH_MISSING_LABELS = \
+    config_manager.EXCLUDE_TABLES_WITH_MISSING_LABELS
+USE_READABILITY = config_manager.USE_READABILITY
+SAVE_ORIGINAL_HTML = config_manager.SAVE_ORIGINAL_HTML
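A sketch of how the runtime-config API introduced above is meant to be called; all names come from this diff, and the values are illustrative:

    from ace.config import update_config, get_config, reset_config

    update_config(CAREFUL_PARSING=False, SAVE_ORIGINAL_HTML=True)
    assert get_config('SAVE_ORIGINAL_HTML') is True

    reset_config('SAVE_ORIGINAL_HTML')  # drop a single override
    reset_config()                      # drop all overrides

    try:
        update_config(NOT_A_SETTING=1)  # unknown keys are rejected
    except ValueError:
        pass

Note that the module-level re-exports at the bottom of the file (e.g. `SILENT_ERRORS = config_manager.SILENT_ERRORS`) are evaluated once at import time, so later `update_config()` calls are only visible through `get_config()` or `config_manager`; that is why the rest of this diff switches call sites from `config.X` to `get_config('X')`.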
diff --git a/ace/database.py b/ace/database.py
index 95181e4..cdfcac9 100644
--- a/ace/database.py
+++ b/ace/database.py
@@ -1,8 +1,10 @@
 # Database stuff and models
-from sqlalchemy import (TypeDecorator, Table, Column, Integer, Float, String, Boolean,
-                        ForeignKey, DateTime, Text)
-from sqlalchemy.orm import relationship, backref, sessionmaker
+from sqlalchemy import (
+    TypeDecorator, Table, Column, Integer, Float, String, Boolean,
+    ForeignKey, DateTime, Text
+)
+from sqlalchemy.orm import relationship, sessionmaker
 from sqlalchemy import create_engine
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.ext.associationproxy import association_proxy
@@ -13,9 +15,8 @@
 import logging
 import sys
 from os import path
-import datetime
+from datetime import datetime
 
-from . import config
+from .config import get_config
 from . import extract
 
 logger = logging.getLogger(__name__)
@@ -32,22 +33,32 @@
     def __init__(self, adapter=None, db_name=None, user=None, password=None):
         ''' Connect to DB and initialize instance. '''
         # Default to settings in config file if none passed
-        if adapter is None: adapter = config.SQL_ADAPTER
+        if adapter is None:
+            adapter = get_config('SQL_ADAPTER')
 
         # Generate DB URI
         if adapter == 'sqlite':
-            db_uri = config.SQLITE_URI if db_name is None else db_name
+            db_uri = get_config('SQLITE_URI') if db_name is None else db_name
        elif adapter == 'mysql':
-            db_name = config.MYSQL_DB if db_name is None else db_name
-            if user is None: user = config.MYSQL_USER
-            if password is None: password = config.MYSQL_PASSWORD
-            db_uri = 'mysql://%s:%s@localhost/%s' % (user, password, db_name)
+            db_name = get_config('MYSQL_DB') if db_name is None else db_name
+            if user is None:
+                user = get_config('MYSQL_USER')
+            if password is None:
+                password = get_config('MYSQL_PASSWORD')
+            db_uri = f'mysql://{user}:{password}@localhost/{db_name}'
         else:
-            raise ValueError("Value of SQL_ADAPTER in settings must be either 'sqlite' or 'mysql'")
+            raise ValueError(
+                "SQL_ADAPTER must be either 'sqlite' or 'mysql'"
+            )
 
-        engine = create_engine(db_uri, echo=False, connect_args={'timeout': 15})
+        # 'timeout' is a SQLite-specific connect arg; skip it for MySQL
+        connect_args = {'timeout': 15} if adapter == 'sqlite' else {}
+        engine = create_engine(
+            db_uri,
+            echo=False,
+            connect_args=connect_args
+        )
 
-        if adapter == 'mysql': engine.execute("SET sql_mode=''")
+        if adapter == 'mysql':
+            engine.execute("SET sql_mode=''")
 
         Session = sessionmaker(bind=engine)
         Base.metadata.create_all(engine)
@@ -60,8 +71,6 @@ def add(self, record):
     def save(self):
         ''' Commit all stored records to file. '''
         self.session.commit()
-        # except Exception as err:
-        #     print(err)
 
     def delete_article(self, pmid):
         article = self.session.query(Article).filter_by(id=pmid).first()
 
@@ -71,23 +80,37 @@ def print_stats(self):
         ''' Summarize the current state of the DB. '''
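For context, a short sketch of how the refactored constructor resolves its connection settings (the path below is a placeholder, not part of the diff):

    from ace.database import Database

    db = Database()  # adapter and URI are read from config via get_config()

    # For sqlite, db_name is used verbatim as the connection URI
    db = Database(adapter='sqlite', db_name='sqlite:////tmp/ace.db')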
         n_articles = self.session.query(Article).count()
-        n_articles_with_coordinates = self.session.query(Article).join(Table).filter(Table.n_activations>0).distinct('article_id').count()
+        n_articles_with_coordinates = self.session.query(Article) \
+            .join(Table) \
+            .filter(Table.n_activations > 0) \
+            .distinct('article_id') \
+            .count()
         n_tables = self.session.query(Table).count()
         n_activations = self.session.query(Activation).count()
         n_links = self.session.query(NeurovaultLink).count()
-        n_articles_with_links = self.session.query(NeurovaultLink).distinct('article_id').count()
-        print(f"The database currently contains: {n_articles} articles.\n"
-              f"{n_articles_with_coordinates} have coordinates, and {n_articles_with_links} have NeuroVault links.\n"
-              f"Total of {n_tables} tables, {n_activations} activations and {n_links} NeuroVault links.")
+        n_articles_with_links = self.session.query(NeurovaultLink) \
+            .distinct('article_id') \
+            .count()
+
+        print(
+            f"The database currently contains: {n_articles} articles.\n"
+            f"{n_articles_with_coordinates} have coordinates, and "
+            f"{n_articles_with_links} have NeuroVault links.\n"
+            f"Total of {n_tables} tables, {n_activations} activations "
+            f"and {n_links} NeuroVault links."
+        )
 
     def article_exists(self, pmid):
         ''' Check if an article already exists in the database. '''
-        return self.session.query(exists().where(Article.id==pmid)).scalar()
+        return self.session.query(
+            exists().where(Article.id == pmid)
+        ).scalar()
 
     @property
     def articles(self):
         return self.session.query(Article).all()
 
+
 # Create a JSONString column type for convenience
 class JsonString(TypeDecorator):
     impl = Text
@@ -121,16 +144,25 @@ class Article(Base):
     abstract = Column(Text)
     citation = Column(Text)
     pubmed_metadata = Column(JsonString)
-    created_at = Column(DateTime, default=datetime.datetime.utcnow)
-    updated_at = Column(DateTime, default=datetime.datetime.utcnow,
-                        onupdate=datetime.datetime.utcnow)
-
-    tables = relationship('Table', cascade="all, delete-orphan",
-                          backref='article')
-
-    neurovault_links = relationship('NeurovaultLink', cascade="all, delete-orphan",
-                                    backref='article')
-
+    created_at = Column(DateTime, default=datetime.utcnow)
+    updated_at = Column(
+        DateTime,
+        default=datetime.utcnow,
+        onupdate=datetime.utcnow
+    )
+
+    tables = relationship(
+        'Table',
+        cascade="all, delete-orphan",
+        backref='article'
+    )
+
+    neurovault_links = relationship(
+        'NeurovaultLink',
+        cascade="all, delete-orphan",
+        backref='article'
+    )
+
     features = association_proxy('tags', 'feature')
 
     def __init__(self, text, pmid=None, doi=None, metadata=None):
@@ -160,8 +192,11 @@ class Table(Base):
     id = Column(Integer, primary_key=True)
     article_id = Column(Integer, ForeignKey('articles.id'))
-    activations = relationship('Activation', cascade="all, delete-orphan",
-                               backref='table')
+    activations = relationship(
+        'Activation',
+        cascade="all, delete-orphan",
+        backref='table'
+    )
     position = Column(Integer)   # The serial position of occurrence
     number = Column(String(10))  # The stated table ID (e.g., 1, 2b)
     label = Column(String(200))  # The full label (e.g., Table 1, Table 2b)
@@ -172,19 +207,7 @@
     input_html = Column(LongText)
 
     def finalize(self):
-        ''' Any cleanup and updating operations we need to do before saving. '''
-
-        # # Remove duplicate activations--most commonly produced by problems with
-        # # the grouping code.
-        # act_defs = set()
-        # to_keep = []
-        # for a in self.activations:
-        #     definition = json.dumps([a.x, a.y, a.z, a.groups])
-        #     if definition not in act_defs:
-        #         act_defs.add(definition)
-        #         to_keep.append(a)
-        # self.activations = to_keep
-
+        ''' Any cleanup before saving. '''
         self.n_activations = len(self.activations)
 
@@ -234,35 +257,35 @@ def add_col(self, key, val):
     # Validates Peak. Considers peak invalid if:
     # * At least one of X, Y, Z is nil or missing
     # * Any |coordinate| > 100
-    # * Two or more columns are zeroes (most of the time this
-    #   will indicate a problem, but occasionally a real coordinate)
-    # Depending on config, either excludes peak, or allows it through
-    # but flags potential problems for later inspection.
+    # * Two or more columns are zeroes
     def validate(self):
-
         for c in [self.x, self.y, self.z]:
             if c == '' or c is None:
-                logger.debug('Missing x, y, or z coordinate information: [%s, %s, %s]' % tuple(
-                    [str(e) for e in [self.x, self.y, self.z]]))
+                logger.debug(
+                    'Missing x, y, or z coordinate: [%s, %s, %s]',
+                    self.x, self.y, self.z
+                )
                 return False
             try:
                 if abs(c) >= 100:
                     logger.debug(
-                        'Invalid coordinates: at least one dimension (x,y,z) >= 100.')
+                        'Invalid coordinates: dimension >= 100.'
+                    )
                     return False
-            except:
-                print(c)
-                print(sys.exc_info()[0])
+            except Exception:
+                logger.exception("Error validating coordinate")
                 raise
 
         sorted_xyz = sorted([abs(self.x), abs(self.y), abs(self.z)])
         if sorted_xyz[0] == 0 and sorted_xyz[1] == 0:
             logger.debug(
-                "At least two dimensions have value == 0; coordinate is probably not real.")
+                "Two dimensions have value 0; probably not real."
+            )
             return False
 
         return True
 
+
 class NeurovaultLink(Base):
     __tablename__ = 'Neurovaultlinks'
diff --git a/ace/export.py b/ace/export.py
index b336d63..b9d7ddd 100644
--- a/ace/export.py
+++ b/ace/export.py
@@ -9,7 +9,7 @@
 logger = logging.getLogger(__name__)
 
-def export_database(db, foldername, skip_empty=True):
+def export_database(db, foldername, skip_empty=True, table_html=False):
     # Create folder if it doesn't exist
     foldername = Path(foldername)
     foldername.mkdir(parents=True, exist_ok=True)
@@ -100,16 +100,17 @@
     with (foldername / 'export.json').open('w') as f:
         json.dump(export_md, f)
 
-    # Save table HTML files if available
-    tables_dir = foldername / 'tables'
-    tables_dir.mkdir(parents=True, exist_ok=True)
-
-    for art in articles:
-        art_dir = tables_dir / str(art.id)
-        art_dir.mkdir(parents=True, exist_ok=True)
+    if table_html:
+        # Save table HTML files if available
+        tables_dir = foldername / 'tables'
+        tables_dir.mkdir(parents=True, exist_ok=True)
 
-        for t in art.tables:
-            if t.input_html:
-                table_file = art_dir / f"{t.id}.html"
-                with table_file.open('w', encoding='utf-8') as f:
-                    f.write(t.input_html)
\ No newline at end of file
+        for art in articles:
+            art_dir = tables_dir / str(art.id)
+
+            for t in art.tables:
+                if t.input_html:
+                    art_dir.mkdir(parents=True, exist_ok=True)
+                    table_file = art_dir / f"{t.id}.html"
+                    with table_file.open('w', encoding='utf-8') as f:
+                        f.write(t.input_html)
\ No newline at end of file
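The new `table_html` flag makes HTML export opt-in; a usage sketch (the folder name is a placeholder):

    from ace.database import Database
    from ace.export import export_database

    db = Database()
    export_database(db, 'ace_export')  # metadata and coordinates only
    # Also writes tables/<article_id>/<table_id>.html for stored HTML
    export_database(db, 'ace_export', table_html=True)

Moving the `art_dir.mkdir()` call inside the inner loop also means empty per-article directories are no longer created for articles without stored table HTML.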
diff --git a/ace/ingest.py b/ace/ingest.py
index cb7a9d9..af117c1 100644
--- a/ace/ingest.py
+++ b/ace/ingest.py
@@ -1,7 +1,8 @@
 from os import path
 import logging
 import re
-from . import sources, config
+from . import sources
+from .config import get_config
 from .scrape import _validate_scrape
 import multiprocessing as mp
 from functools import partial
@@ -10,16 +11,16 @@
 logger = logging.getLogger(__name__)
 
 
 def _process_file_with_source(args):
-    """Helper function to read, validate, and identify source for a single file."""
+    """Helper function to read, validate, and identify source for a file."""
     f, source_configs = args
     try:
         html = open(f).read()
     except Exception as e:
-        logger.warning("Failed to read file %s: %s" % (f, str(e)))
+        logger.warning("Failed to read file %s: %s", f, str(e))
         return f, None, None
 
     if not _validate_scrape(html):
-        logger.warning("Invalid HTML for %s" % f)
+        logger.warning("Invalid HTML for %s", f)
         return f, None, None
 
     # Identify source from HTML using regex patterns
@@ -27,7 +28,7 @@
     for name, identifiers in source_configs.items():
         for patt in identifiers:
             if re.search(patt, html):
-                logger.debug('Matched article to Source: %s' % name)
+                logger.debug('Matched article to Source: %s', name)
                 source_name = name
                 break
         if source_name:
@@ -49,48 +50,25 @@ def _parse_article(args):
         # Fallback to original source identification
         source = manager.identify_source(html)
         if source is None:
-            logger.info("Could not identify source for %s" % f)
+            logger.info("Could not identify source for %s", f)
             return f, None
 
         article = source.parse_article(html, pmid, metadata_dir=metadata_dir, **kwargs)
         return f, article
     except Exception as e:
-        logger.info("Error parsing article %s: %s" % (f, str(e)))
+        logger.info("Error parsing article %s: %s", f, str(e))
         return f, None
 
 
 def add_articles(db, files, commit=True, table_dir=None, limit=None,
-                 pmid_filenames=False, metadata_dir=None, force_ingest=True, num_workers=None, use_readability=None, **kwargs):
-    ''' Process articles and add their data to the DB.
-    Args:
-        files: The path to the article(s) to process. Can be a single
-            filename (string), a list of filenames, or a path to pass
-            to glob (e.g., "article_ls dir/NIMG*html")
-        commit: Whether or not to save records to DB file after adding them.
-        table_dir: Directory to store downloaded tables in (if None, tables
-            will not be saved.)
-        limit: Optional integer indicating max number of articles to add
-            (selected randomly from all available). When None, will add all
-            available articles.
-        pmid_filenames: When True, assume that the file basename is a PMID.
-            This saves us from having to retrieve metadata from PubMed When
-            checking if a file is already in the DB, and greatly speeds up
-            batch processing when overwrite is off.
-        metadata_dir: Location to read/write PubMed metadata for articles.
-            When None (default), retrieves new metadata each time. If a
-            path is provided, will check there first before querying PubMed,
-            and will save the result of the query if it doesn't already
-            exist.
-        force_ingest: Ingest even if no source is identified.
-        num_workers: Number of worker processes to use when processing in parallel.
-            If None (default), uses the number of CPUs available on the system.
-        use_readability: When True, use readability.py for HTML cleaning if available.
-            When False, use fallback HTML processing by default. If None (default),
-            uses the value from config.USE_READABILITY.
-        kwargs: Additional keyword arguments to pass to parse_article.
-    '''
-
-    manager = sources.SourceManager(table_dir, use_readability=use_readability if use_readability is not None else config.USE_READABILITY)
+                 pmid_filenames=False, metadata_dir=None, force_ingest=True, num_workers=None,
+                 use_readability=None, **kwargs):
+    ''' Process articles and add their data to the DB. '''
+    manager = sources.SourceManager(
+        table_dir,
+        use_readability=use_readability if use_readability is not None
+        else get_config('USE_READABILITY')
+    )
 
     # Prepare source configurations for parallel processing
     source_configs = {name: source.identifiers for name, source in manager.sources.items()}
@@ -103,68 +81,63 @@
         shuffle(files)
         files = files[:limit]
 
-    missing_sources = []
-
-    # Step 1: Process files in parallel to extract HTML content and identify sources
+    # Step 1: Process files to extract HTML and identify sources
     if num_workers is not None and num_workers != 1:
-        # Process files in parallel to extract HTML content and identify sources
         process_args = [(f, source_configs) for f in files]
         with mp.Pool(processes=num_workers) as pool:
-            file_html_source_tuples = list(tqdm(pool.imap_unordered(_process_file_with_source, process_args), total=len(process_args), desc="Processing files"))
+            results = list(tqdm(
+                pool.imap_unordered(_process_file_with_source, process_args),
+                total=len(process_args),
+                desc="Processing files"
+            ))
     else:
-        # Process files sequentially
-        file_html_source_tuples = []
+        results = []
         for f in tqdm(files, desc="Processing files"):
-            result = _process_file_with_source((f, source_configs))
-            file_html_source_tuples.append(result)
-
-    # Step 2: In serial mode, use the db object to skip articles that have been already added
-    # Filter out files with reading/validation errors
-    valid_files = []
-    for f, html, source_name in file_html_source_tuples:
-        if html is not None:
-            valid_files.append((f, html, source_name))
-    # We'll handle missing sources later when we actually parse the articles
-
-    # Filter out articles that already exist in the database
-    files_to_process = []
-    missing_sources = []
+            results.append(_process_file_with_source((f, source_configs)))
 
+    # Filter valid files
+    valid_files = [(f, html, src) for f, html, src in results if html is not None]
+
+    # Filter out existing articles if not overwriting
+    files_to_process = []
     for f, html, source_name in valid_files:
         pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None
 
-        # Check if article already exists
-        if pmid is not None and db.article_exists(pmid) and not config.OVERWRITE_EXISTING_ROWS:
+        # Check if article exists and should be skipped
+        if pmid and db.article_exists(pmid) and not get_config('OVERWRITE_EXISTING_ROWS'):
             continue
 
         files_to_process.append((f, html, source_name, pmid))
 
-    # Step 3: Process remaining articles in parallel
-    # Prepare arguments for _parse_article
-    parse_args = [(f, html, source_name, pmid, manager, metadata_dir, force_ingest, kwargs)
-                  for f, html, source_name, pmid in files_to_process]
+    # Step 2: Parse articles
+    parse_args = [
+        (f, html, source_name, pmid, manager, metadata_dir, force_ingest, kwargs)
+        for f, html, source_name, pmid in files_to_process
+    ]
 
     if num_workers is not None and num_workers != 1 and parse_args:
-        # Parse articles in parallel
         with mp.Pool(processes=num_workers) as pool:
-            parsed_articles = list(tqdm(pool.imap_unordered(_parse_article, parse_args), total=len(parse_args), desc="Parsing articles"))
+            parsed_articles = list(tqdm(
+                pool.imap_unordered(_parse_article, parse_args),
+                total=len(parse_args),
+                desc="Parsing articles"
+            ))
     else:
-        # Parse articles sequentially
         parsed_articles = []
         for args in tqdm(parse_args, desc="Parsing articles"):
             parsed_articles.append(_parse_article(args))
 
     # Add successfully parsed articles to database
+    missing_sources = []
     for i, (f, article) in enumerate(parsed_articles):
-        if article in [None, False]:
+        if not article:
             missing_sources.append(f)
             continue
 
-        if config.SAVE_ARTICLES_WITHOUT_ACTIVATIONS or article.tables:
-            # Check again if article exists and handle overwrite
+        if get_config('SAVE_ARTICLES_WITHOUT_ACTIVATIONS') or article.tables:
             pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None
-            if pmid is not None and db.article_exists(pmid):
-                if config.OVERWRITE_EXISTING_ROWS:
+            if pmid and db.article_exists(pmid):
+                if get_config('OVERWRITE_EXISTING_ROWS'):
                     db.delete_article(pmid)
                 else:
                     continue
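A sketch of the parallel ingestion path after this refactor (the glob pattern and worker count are illustrative; per the removed docstring, `files` may be a filename, a list, or a glob pattern):

    from ace.database import Database
    from ace.ingest import add_articles

    db = Database()
    add_articles(db, '/data/html/*.html',
                 pmid_filenames=True,  # treat '12345.html' as PMID 12345
                 num_workers=4)        # None or 1 keeps both steps sequential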
diff --git a/ace/tableparser.py b/ace/tableparser.py
index a15c156..2d1f3f0 100644
--- a/ace/tableparser.py
+++ b/ace/tableparser.py
@@ -4,7 +4,7 @@
 # import database
 import regex  # Note: we're using features in the new regex module, not re!
 import logging
-from . import config
+from .config import get_config
 from .database import Activation, Table
 
 from collections import Counter, defaultdict
@@ -227,7 +227,7 @@
     table = Table()
 
     # Only store the original HTML if the global config allows it
-    if html is not None and config.SAVE_ORIGINAL_HTML:
+    if html is not None and get_config('SAVE_ORIGINAL_HTML'):
         table.input_html = html
 
     n_cols = data.n_cols
@@ -287,7 +287,7 @@
     if None in labels:
         labels = [str(l) for l in labels]
         msg = 'Failed to identify at least one column label: [%s]. Skipping table!' % ', '.join(labels)
-        if config.EXCLUDE_TABLES_WITH_MISSING_LABELS:
+        if get_config('EXCLUDE_TABLES_WITH_MISSING_LABELS'):
             logger.error(msg)
             return None
         else:
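Because `parse_table` now reads `SAVE_ORIGINAL_HTML` through `get_config`, the setting can be toggled at runtime and picked up on the next parse without reloading modules. A sketch tying the pieces of this diff together (paths are placeholders):

    from ace.config import update_config
    from ace.database import Database
    from ace.ingest import add_articles
    from ace.export import export_database

    update_config(SAVE_ORIGINAL_HTML=True)  # parsed tables keep their source HTML
    db = Database()
    add_articles(db, '/data/html/*.html', pmid_filenames=True)
    export_database(db, 'ace_export', table_html=True)  # writes the stored HTML back out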