7 changes: 0 additions & 7 deletions ace/.vscode/settings.json

This file was deleted.

7 changes: 7 additions & 0 deletions ace/config.py
@@ -54,6 +54,13 @@
# anyway, so this should be left off unless problems arise.
EXCLUDE_TABLES_WITH_MISSING_LABELS = False

# Whether to use readability.py for HTML cleaning when available.
# When False, use the fallback HTML processing instead.
USE_READABILITY = True

# Whether to save the original HTML of the table in the Table object
SAVE_ORIGINAL_HTML = False




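As a point of reference, a minimal sketch of how code elsewhere might honour the new flags; the clean_html helper and the readability import are illustrative assumptions, not part of this diff:

from ace import config

def clean_html(html):
    ''' Clean article HTML, preferring readability when enabled and available. '''
    if config.USE_READABILITY:
        try:
            from readability import Document  # readability-lxml, if installed
            return Document(html).summary()
        except ImportError:
            pass
    # Fallback HTML processing: return the markup unchanged in this sketch
    return html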
1 change: 1 addition & 0 deletions ace/database.py
@@ -169,6 +169,7 @@ class Table(Base):
    notes = Column(Text)
    n_activations = Column(Integer)
    n_columns = Column(Integer)
    input_html = Column(LongText)

    def finalize(self):
        ''' Any cleanup and updating operations we need to do before saving. '''
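The new input_html column can be read back with ordinary SQLAlchemy once populated; a hedged sketch (the engine URL and session setup are assumptions, and the column is presumably only filled when config.SAVE_ORIGINAL_HTML is enabled):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from ace.database import Table

# Open a session against an existing ACE database file (path is a placeholder)
session = sessionmaker(bind=create_engine('sqlite:///ace.db'))()
for t in session.query(Table).filter(Table.input_html.isnot(None)).limit(5):
    print(t.id, len(t.input_html))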
14 changes: 11 additions & 3 deletions ace/datatable.py
@@ -32,9 +32,17 @@ def n_rows(self):
    def add_val(self, val, rows=1, cols=1):
        ''' Find next open position and add values to grid '''

        # Flatten list and find next open position
        flat = [item for l in self.data for item in l]
        flat_set = set(flat)
        flat = []
        for row in self.data:
            # If row is not a list for some reason, treat as single-item row
            if isinstance(row, list):
                for item in row:
                    flat.append(item)
            else:
                flat.append(row)

        # Only include hashable items in the set (skip unhashable like lists)
        flat_set = set(x for x in flat if not isinstance(x, list))

        if not None in flat_set:
            open_pos = self.n_rows * self.n_cols
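A tiny standalone illustration (not part of the diff) of why the guard matters: set() raises TypeError on unhashable items such as nested lists, which the old one-liner did not tolerate:

# A malformed grid where one cell is itself a list
data = [[None, 'x'], [['stray', 'list'], 'y']]
flat = []
for row in data:
    if isinstance(row, list):
        flat.extend(row)
    else:
        flat.append(row)            # a non-list row is treated as a single cell
flat_set = set(x for x in flat if not isinstance(x, list))
print(None in flat_set)             # True, so an open position still exists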
16 changes: 15 additions & 1 deletion ace/export.py
@@ -98,4 +98,18 @@ def export_database(db, foldername, skip_empty=True):
    }

    with (foldername / 'export.json').open('w') as f:
        json.dump(export_md, f)
        json.dump(export_md, f)

    # Save table HTML files if available
    tables_dir = foldername / 'tables'
    tables_dir.mkdir(parents=True, exist_ok=True)

    for art in articles:
        art_dir = tables_dir / str(art.id)
        art_dir.mkdir(parents=True, exist_ok=True)

        for t in art.tables:
            if t.input_html:
                table_file = art_dir / f"{t.id}.html"
                with table_file.open('w', encoding='utf-8') as f:
                    f.write(t.input_html)
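Hedged usage sketch: export_database's signature matches this diff, but the db object and output path below are placeholders:

from pathlib import Path
from ace.export import export_database

out = Path('ace_export')
out.mkdir(exist_ok=True)
export_database(db, out)   # `db` is an already-initialised ACE database handle
# Per this diff, any saved table HTML lands in:
#   ace_export/tables/<article_id>/<table_id>.html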
41 changes: 14 additions & 27 deletions ace/ingest.py
@@ -5,29 +5,13 @@
from .scrape import _validate_scrape
import multiprocessing as mp
from functools import partial
from tqdm import tqdm

logger = logging.getLogger(__name__)

def _process_file(f):
    """Helper function to read and validate a single file."""
    logger.info("Processing article %s..." % f)
    try:
        html = open(f).read()
    except Exception as e:
        logger.warning("Failed to read file %s: %s" % (f, str(e)))
        return f, None

    if not _validate_scrape(html):
        logger.warning("Invalid HTML for %s" % f)
        return f, None

    return f, html


def _process_file_with_source(args):
    """Helper function to read, validate, and identify source for a single file."""
    f, source_configs = args
    logger.info("Processing article %s..." % f)
    try:
        html = open(f).read()
    except Exception as e:
@@ -65,18 +49,18 @@ def _parse_article(args):
        # Fallback to original source identification
        source = manager.identify_source(html)
        if source is None:
            logger.warning("Could not identify source for %s" % f)
            logger.info("Could not identify source for %s" % f)
            return f, None

        article = source.parse_article(html, pmid, metadata_dir=metadata_dir, **kwargs)
        return f, article
    except Exception as e:
        logger.warning("Error parsing article %s: %s" % (f, str(e)))
        logger.info("Error parsing article %s: %s" % (f, str(e)))
        return f, None


def add_articles(db, files, commit=True, table_dir=None, limit=None,
                 pmid_filenames=False, metadata_dir=None, force_ingest=True, num_workers=None, **kwargs):
                 pmid_filenames=False, metadata_dir=None, force_ingest=True, num_workers=None, use_readability=None, **kwargs):
    ''' Process articles and add their data to the DB.
    Args:
        files: The path to the article(s) to process. Can be a single
@@ -100,10 +84,13 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
        force_ingest: Ingest even if no source is identified.
        num_workers: Number of worker processes to use when processing in parallel.
            If None (default), uses the number of CPUs available on the system.
        use_readability: When True, use readability.py for HTML cleaning if available.
            When False, use the fallback HTML processing instead. If None (default),
            uses the value from config.USE_READABILITY.
        kwargs: Additional keyword arguments to pass to parse_article.
    '''

    manager = sources.SourceManager(table_dir)
    manager = sources.SourceManager(table_dir, use_readability=use_readability if use_readability is not None else config.USE_READABILITY)

    # Prepare source configurations for parallel processing
    source_configs = {name: source.identifiers for name, source in manager.sources.items()}
@@ -123,11 +110,11 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
        # Process files in parallel to extract HTML content and identify sources
        process_args = [(f, source_configs) for f in files]
        with mp.Pool(processes=num_workers) as pool:
            file_html_source_tuples = pool.map(_process_file_with_source, process_args)
            file_html_source_tuples = list(tqdm(pool.imap_unordered(_process_file_with_source, process_args), total=len(process_args), desc="Processing files"))
    else:
        # Process files sequentially
        file_html_source_tuples = []
        for f in files:
        for f in tqdm(files, desc="Processing files"):
            result = _process_file_with_source((f, source_configs))
            file_html_source_tuples.append(result)

@@ -142,7 +129,7 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
    # Filter out articles that already exist in the database
    files_to_process = []
    missing_sources = []

    for f, html, source_name in valid_files:
        pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None

@@ -160,16 +147,16 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
    if num_workers is not None and num_workers != 1 and parse_args:
        # Parse articles in parallel
        with mp.Pool(processes=num_workers) as pool:
            parsed_articles = pool.map(_parse_article, parse_args)
            parsed_articles = list(tqdm(pool.imap_unordered(_parse_article, parse_args), total=len(parse_args), desc="Parsing articles"))
    else:
        # Parse articles sequentially
        parsed_articles = []
        for args in parse_args:
        for args in tqdm(parse_args, desc="Parsing articles"):
            parsed_articles.append(_parse_article(args))

    # Add successfully parsed articles to database
    for i, (f, article) in enumerate(parsed_articles):
        if article is None:
        if article in [None, False]:
            missing_sources.append(f)
            continue

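A hedged example of calling the updated entry point; db and the glob pattern are placeholders, while the keyword arguments come from the new signature above:

from glob import glob
from ace.ingest import add_articles

files = glob('articles/*.html')
add_articles(
    db, files,
    num_workers=4,           # read and parse in parallel, with tqdm progress bars
    use_readability=False,   # skip readability.py and use the fallback cleaning
    pmid_filenames=True,
)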