7 changes: 0 additions & 7 deletions ace/.vscode/settings.json

This file was deleted.

7 changes: 7 additions & 0 deletions ace/config.py
@@ -54,6 +54,13 @@
# anyway, so this should be left off unless problems arise.
EXCLUDE_TABLES_WITH_MISSING_LABELS = False

# Whether to use readability.py for HTML cleaning when available.
# When False, use the fallback HTML processing instead.
USE_READABILITY = True

# Whether to save the original HTML of the table in the Table object
SAVE_ORIGINAL_HTML = False




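As a point of reference, a minimal sketch of how code elsewhere might honour the new flags; the clean_html helper and the readability import are illustrative assumptions, not part of this diff:

from ace import config

def clean_html(html):
    ''' Clean article HTML, preferring readability when enabled and available. '''
    if config.USE_READABILITY:
        try:
            from readability import Document  # readability-lxml, if installed
            return Document(html).summary()
        except ImportError:
            pass
    # Fallback HTML processing: return the markup unchanged in this sketch
    return html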
1 change: 1 addition & 0 deletions ace/database.py
@@ -169,6 +169,7 @@ class Table(Base):
    notes = Column(Text)
    n_activations = Column(Integer)
    n_columns = Column(Integer)
    input_html = Column(LongText)

    def finalize(self):
        ''' Any cleanup and updating operations we need to do before saving. '''
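The new input_html column can be read back with ordinary SQLAlchemy once populated; a hedged sketch (the engine URL and session setup are assumptions, and the column is presumably only filled when config.SAVE_ORIGINAL_HTML is enabled):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from ace.database import Table

# Open a session against an existing ACE database file (path is a placeholder)
session = sessionmaker(bind=create_engine('sqlite:///ace.db'))()
for t in session.query(Table).filter(Table.input_html.isnot(None)).limit(5):
    print(t.id, len(t.input_html))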
14 changes: 11 additions & 3 deletions ace/datatable.py
@@ -32,9 +32,17 @@ def n_rows(self):
    def add_val(self, val, rows=1, cols=1):
        ''' Find next open position and add values to grid '''

        # Flatten list and find next open position
        flat = [item for l in self.data for item in l]
        flat_set = set(flat)
        flat = []
        for row in self.data:
            # If row is not a list for some reason, treat as single-item row
            if isinstance(row, list):
                for item in row:
                    flat.append(item)
            else:
                flat.append(row)

        # Only include hashable items in the set (skip unhashable like lists)
        flat_set = set(x for x in flat if not isinstance(x, list))

        if not None in flat_set:
            open_pos = self.n_rows * self.n_cols
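A tiny standalone illustration (not part of the diff) of why the guard matters: set() raises TypeError on unhashable items such as nested lists, which the old one-liner did not tolerate:

# A malformed grid where one cell is itself a list
data = [[None, 'x'], [['stray', 'list'], 'y']]
flat = []
for row in data:
    if isinstance(row, list):
        flat.extend(row)
    else:
        flat.append(row)            # a non-list row is treated as a single cell
flat_set = set(x for x in flat if not isinstance(x, list))
print(None in flat_set)             # True, so an open position still exists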
16 changes: 15 additions & 1 deletion ace/export.py
@@ -98,4 +98,18 @@ def export_database(db, foldername, skip_empty=True):
    }

    with (foldername / 'export.json').open('w') as f:
        json.dump(export_md, f)
        json.dump(export_md, f)

    # Save table HTML files if available
    tables_dir = foldername / 'tables'
    tables_dir.mkdir(parents=True, exist_ok=True)

    for art in articles:
        art_dir = tables_dir / str(art.id)
        art_dir.mkdir(parents=True, exist_ok=True)

        for t in art.tables:
            if t.input_html:
                table_file = art_dir / f"{t.id}.html"
                with table_file.open('w', encoding='utf-8') as f:
                    f.write(t.input_html)
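Hedged usage sketch: export_database's signature matches this diff, but the db object and output path below are placeholders:

from pathlib import Path
from ace.export import export_database

out = Path('ace_export')
out.mkdir(exist_ok=True)
export_database(db, out)   # `db` is an already-initialised ACE database handle
# Per this diff, any saved table HTML lands in:
#   ace_export/tables/<article_id>/<table_id>.html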
41 changes: 14 additions & 27 deletions ace/ingest.py
@@ -5,29 +5,13 @@
from .scrape import _validate_scrape
import multiprocessing as mp
from functools import partial
from tqdm import tqdm

logger = logging.getLogger(__name__)

def _process_file(f):
    """Helper function to read and validate a single file."""
    logger.info("Processing article %s..." % f)
    try:
        html = open(f).read()
    except Exception as e:
        logger.warning("Failed to read file %s: %s" % (f, str(e)))
        return f, None

    if not _validate_scrape(html):
        logger.warning("Invalid HTML for %s" % f)
        return f, None

    return f, html


def _process_file_with_source(args):
    """Helper function to read, validate, and identify source for a single file."""
    f, source_configs = args
    logger.info("Processing article %s..." % f)
    try:
        html = open(f).read()
    except Exception as e:
@@ -65,18 +49,18 @@ def _parse_article(args):
        # Fallback to original source identification
        source = manager.identify_source(html)
        if source is None:
            logger.warning("Could not identify source for %s" % f)
            logger.info("Could not identify source for %s" % f)
            return f, None

        article = source.parse_article(html, pmid, metadata_dir=metadata_dir, **kwargs)
        return f, article
    except Exception as e:
        logger.warning("Error parsing article %s: %s" % (f, str(e)))
        logger.info("Error parsing article %s: %s" % (f, str(e)))
        return f, None


def add_articles(db, files, commit=True, table_dir=None, limit=None,
                 pmid_filenames=False, metadata_dir=None, force_ingest=True, num_workers=None, **kwargs):
                 pmid_filenames=False, metadata_dir=None, force_ingest=True, num_workers=None, use_readability=None, **kwargs):
    ''' Process articles and add their data to the DB.
    Args:
        files: The path to the article(s) to process. Can be a single
@@ -100,10 +84,13 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
        force_ingest: Ingest even if no source is identified.
        num_workers: Number of worker processes to use when processing in parallel.
            If None (default), uses the number of CPUs available on the system.
        use_readability: When True, use readability.py for HTML cleaning if available.
            When False, use the fallback HTML processing instead. If None (default),
            uses the value from config.USE_READABILITY.
        kwargs: Additional keyword arguments to pass to parse_article.
    '''

    manager = sources.SourceManager(table_dir)
    manager = sources.SourceManager(table_dir, use_readability=use_readability if use_readability is not None else config.USE_READABILITY)

    # Prepare source configurations for parallel processing
    source_configs = {name: source.identifiers for name, source in manager.sources.items()}
@@ -123,11 +110,11 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
        # Process files in parallel to extract HTML content and identify sources
        process_args = [(f, source_configs) for f in files]
        with mp.Pool(processes=num_workers) as pool:
            file_html_source_tuples = pool.map(_process_file_with_source, process_args)
            file_html_source_tuples = list(tqdm(pool.imap_unordered(_process_file_with_source, process_args), total=len(process_args), desc="Processing files"))
    else:
        # Process files sequentially
        file_html_source_tuples = []
        for f in files:
        for f in tqdm(files, desc="Processing files"):
            result = _process_file_with_source((f, source_configs))
            file_html_source_tuples.append(result)

@@ -142,7 +129,7 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
    # Filter out articles that already exist in the database
    files_to_process = []
    missing_sources = []

    for f, html, source_name in valid_files:
        pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None

@@ -160,16 +147,16 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
    if num_workers is not None and num_workers != 1 and parse_args:
        # Parse articles in parallel
        with mp.Pool(processes=num_workers) as pool:
            parsed_articles = pool.map(_parse_article, parse_args)
            parsed_articles = list(tqdm(pool.imap_unordered(_parse_article, parse_args), total=len(parse_args), desc="Parsing articles"))
    else:
        # Parse articles sequentially
        parsed_articles = []
        for args in parse_args:
        for args in tqdm(parse_args, desc="Parsing articles"):
            parsed_articles.append(_parse_article(args))

    # Add successfully parsed articles to database
    for i, (f, article) in enumerate(parsed_articles):
        if article is None:
        if article in [None, False]:
            missing_sources.append(f)
            continue

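A hedged example of calling the updated entry point; db and the glob pattern are placeholders, while the keyword arguments come from the new signature above:

from glob import glob
from ace.ingest import add_articles

files = glob('articles/*.html')
add_articles(
    db, files,
    num_workers=4,           # read and parse in parallel, with tqdm progress bars
    use_readability=False,   # skip readability.py and use the fallback cleaning
    pmid_filenames=True,
)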