Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions ace/__init__.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,43 @@
# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 sw=4 et:
"""ACE -- Automated Coordinate Extraction.
"""
# Names exported by ``from ace import *``. Assigned exactly once; a second
# assignment would silently replace the first.
__all__ = [
    "config",
    "ingest",
    "database",
    "datatable",
    "set_logging_level",
    "scrape",
    "sources",
    "tableparser",
    "tests",
    "__version__",
]

import logging
import sys
import os

from .version import __version__


def set_logging_level(level=None):
    """Set the package-wide logging level.

    Args:
        level: Either a level name known to the ``logging`` module
            ("warning", "error", "info", ...; case-insensitive) or the
            matching numeric constant such as ``logging.INFO``. When None,
            the ``ACE_LOGLEVEL`` environment variable is used, falling
            back to "warn".

    Returns:
        The effective level of the module-level ``logger`` after the change.

    Raises:
        AttributeError: If a level name is not an attribute of ``logging``.
    """
    if level is None:
        level = os.environ.get('ACE_LOGLEVEL', 'warn')
    # Names are resolved through the logging module. Numeric constants
    # (e.g. logging.DEBUG) have no .upper() and are already in the form
    # Logger.setLevel() expects, so they are passed through untouched.
    if isinstance(level, str):
        level = getattr(logging, level.upper())
    logger.setLevel(level)
    return logger.getEffectiveLevel()


def _setup_logger(logger):
    """Attach a stdout console handler to *logger* and apply the default level.

    Args:
        logger: The ``logging.Logger`` that receives the console handler.
    """
    # Basic logging setup: one stream handler writing to stdout.
    console = logging.StreamHandler(sys.stdout)
    # The formatter is installed once; a second setFormatter() call with
    # the same format string would only replace it with an equal object.
    formatter = logging.Formatter(
        "%(levelname)-6s %(module)-7s %(message)s"
    )
    console.setFormatter(formatter)
    logger.addHandler(console)
    # Called without arguments, so the level is taken from the environment
    # (or the built-in default) by set_logging_level().
    set_logging_level()


# Set up the package logger at import time: fetch the "ace" logger and hand
# it to _setup_logger(), which attaches a stdout handler and applies the
# default level. `logger` must be bound before _setup_logger() runs because
# set_logging_level() reads it as a module global.
logger = logging.getLogger("ace")
_setup_logger(logger)
150 changes: 123 additions & 27 deletions ace/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,33 +62,129 @@
SAVE_ORIGINAL_HTML = False




''' SCRAPING/PARSING SETTINGS '''
# Pool of desktop Chrome user-agent strings (Windows, macOS and Linux).
# Every entry ends with a comma: two adjacent string literals without one
# are silently concatenated into a single, invalid user-agent string.
# NOTE(review): a few strings appear more than once, which weights them more
# heavily if entries are picked uniformly -- confirm this is intended.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.60 Safari/537.36',  # noqa: E501
]


class ConfigManager:
    """Manage runtime overrides of ACE's module-level configuration settings.

    Default values are captured from the module-level constants when the
    manager is constructed. Overrides are kept in a separate mapping so
    they can be discarded again with ``reset``. Settings are read as
    attributes, e.g. ``manager.SQL_ADAPTER``; an override, when present,
    takes precedence over the default.
    """

    def __init__(self):
        # Capture initial defaults from the module-level constants.
        self._defaults = {
            'SILENT_ERRORS': SILENT_ERRORS,
            'SQL_ADAPTER': SQL_ADAPTER,
            'SQLITE_URI': SQLITE_URI,
            'MYSQL_USER': MYSQL_USER,
            'MYSQL_PASSWORD': MYSQL_PASSWORD,
            'MYSQL_DB': MYSQL_DB,
            'SAVE_ARTICLES_WITHOUT_ACTIVATIONS':
                SAVE_ARTICLES_WITHOUT_ACTIVATIONS,
            'OVERWRITE_EXISTING_ROWS': OVERWRITE_EXISTING_ROWS,
            'CAREFUL_PARSING': CAREFUL_PARSING,
            'IGNORE_BAD_ROWS': IGNORE_BAD_ROWS,
            'EXCLUDE_TABLES_WITH_MISSING_LABELS':
                EXCLUDE_TABLES_WITH_MISSING_LABELS,
            'USE_READABILITY': USE_READABILITY,
            'SAVE_ORIGINAL_HTML': SAVE_ORIGINAL_HTML,
        }
        # Overrides live on the instance. A class-level dict would be
        # mutated by update() and therefore shared by every instance.
        self._overrides = {}

    def update(self, **kwargs):
        """Update configuration settings at runtime.

        Every key is validated before any value is stored, so a call that
        raises leaves the current configuration untouched.

        Args:
            **kwargs: Setting names mapped to their new values.

        Raises:
            ValueError: If a key is not one of the known settings.
        """
        for key in kwargs:
            if key not in self._defaults:
                raise ValueError(f"Invalid config key: {key}")
        self._overrides.update(kwargs)

    def reset(self, key=None):
        """Reset configuration to default values.

        Args:
            key (str): Specific setting to reset. When None, all overrides
                are dropped. A key without an override is ignored.
        """
        if key is None:
            self._overrides.clear()
        else:
            self._overrides.pop(key, None)

    def __getattr__(self, name):
        # Invoked only when normal attribute lookup fails. Underscore names
        # are never settings; rejecting them up front prevents infinite
        # recursion when _overrides/_defaults themselves are missing (an
        # instance created without running __init__, e.g. by copy/pickle).
        if not name.startswith('_'):
            if name in self._overrides:
                return self._overrides[name]
            if name in self._defaults:
                return self._defaults[name]
        raise AttributeError(f"Config setting {name} does not exist")


# Create the global config manager instance at import time. The
# update_config(), reset_config() and get_config() helpers all delegate to
# this one object.
config_manager = ConfigManager()


# Proxy functions for easier access
def update_config(**kwargs):
    """Override one or more settings on the shared ``config_manager``.

    Args:
        **kwargs: Setting names mapped to new values; forwarded unchanged
            to ``config_manager.update``.
    """
    config_manager.update(**kwargs)


def reset_config(key=None):
    """Restore default values on the shared ``config_manager``.

    Args:
        key: Setting to restore; forwarded unchanged to
            ``config_manager.reset``.
    """
    config_manager.reset(key)


def get_config(key):
    """Return the value ``config_manager`` currently reports for *key*.

    Args:
        key: Name of the setting, looked up as an attribute of
            ``config_manager`` at call time.
    """
    current = getattr(config_manager, key)
    return current


# Expose config settings through the manager.
# NOTE(review): each name below is bound once, when this module is imported,
# to whatever the manager returns at that moment. Later update_config() /
# reset_config() calls do not rebind these names; get_config() looks the
# value up at call time and therefore reflects overrides.
SILENT_ERRORS = config_manager.SILENT_ERRORS
SQL_ADAPTER = config_manager.SQL_ADAPTER
SQLITE_URI = config_manager.SQLITE_URI
MYSQL_USER = config_manager.MYSQL_USER
MYSQL_PASSWORD = config_manager.MYSQL_PASSWORD
MYSQL_DB = config_manager.MYSQL_DB
SAVE_ARTICLES_WITHOUT_ACTIVATIONS = \
    config_manager.SAVE_ARTICLES_WITHOUT_ACTIVATIONS
OVERWRITE_EXISTING_ROWS = config_manager.OVERWRITE_EXISTING_ROWS
CAREFUL_PARSING = config_manager.CAREFUL_PARSING
IGNORE_BAD_ROWS = config_manager.IGNORE_BAD_ROWS
EXCLUDE_TABLES_WITH_MISSING_LABELS = \
    config_manager.EXCLUDE_TABLES_WITH_MISSING_LABELS
USE_READABILITY = config_manager.USE_READABILITY
SAVE_ORIGINAL_HTML = config_manager.SAVE_ORIGINAL_HTML
Loading
Loading