14 changes: 9 additions & 5 deletions README.md
@@ -222,16 +222,18 @@ The code trains models for 2 different sets of table extraction tasks:

For a detailed description of these tasks and the models, please refer to the paper.

To train, you need to ```cd``` to the ```src``` directory and specify: 1. the path to the dataset, 2. the task (detection or structure), and 3. the path to the config file, which contains the hyperparameters for the architecture and training.
To train, you need to specify: 1. the path to the dataset, 2. the task (detection or structure), and 3. the path to the config file, which contains the hyperparameters for the architecture and training.

**Note:** All commands below should be run from the **parent directory of the cloned repository** (i.e., from `parent_dir` when the repository is cloned to `parent_dir/table_transformer`).

To train the detection model:
```
python main.py --data_type detection --config_file detection_config.json --data_root_dir /path/to/detection_data
python -m table_transformer.src.main --data_type detection --config_file detection_config.json --data_root_dir /path/to/detection_data
```

To train the structure recognition model:
```
python main.py --data_type structure --config_file structure_config.json --data_root_dir /path/to/structure_data
python -m table_transformer.src.main --data_type structure --config_file structure_config.json --data_root_dir /path/to/structure_data
```
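
The `python -m` invocation assumes the checkout is importable as a package from the working directory; this change adds the needed `__init__.py` files at the repository root and in `src`. The expected layout (paths are illustrative) is:

```
parent_dir/                  <- run commands from here
└── table_transformer/
    ├── __init__.py
    ├── detr/
    └── src/
        ├── __init__.py
        └── main.py
```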

## Evaluation
@@ -243,14 +245,16 @@ For more details on GriTS, please see our papers.

To compute object detection metrics for the detection model:

**Note:** All commands below should be run from the **parent directory of the cloned repository** (i.e., from `parent_dir` when the repository is cloned to `parent_dir/table_transformer`).

```
python main.py --mode eval --data_type detection --config_file detection_config.json --data_root_dir /path/to/pascal_voc_detection_data --model_load_path /path/to/detection_model
python -m table_transformer.src.main --mode eval --data_type detection --config_file detection_config.json --data_root_dir /path/to/pascal_voc_detection_data --model_load_path /path/to/detection_model
```

To compute object detection and GriTS metrics for the structure recognition model:

```
python main.py --mode eval --data_type structure --config_file structure_config.json --data_root_dir /path/to/pascal_voc_structure_data --model_load_path /path/to/structure_model --table_words_dir /path/to/json_table_words_data
python -m table_transformer.src.main --mode eval --data_type structure --config_file structure_config.json --data_root_dir /path/to/pascal_voc_structure_data --model_load_path /path/to/structure_model --table_words_dir /path/to/json_table_words_data
```

Optionally you can add flags for things like controlling parallelization, saving detailed metrics, and saving visualizations:\
Empty file added __init__.py
Empty file.
1 change: 1 addition & 0 deletions detr/__init__.py
@@ -0,0 +1 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 changes: 1 addition & 1 deletion detr/datasets/coco.py
@@ -11,7 +11,7 @@
import torchvision
from pycocotools import mask as coco_mask

import datasets.transforms as T
from . import transforms as T


class CocoDetection(torchvision.datasets.CocoDetection):
2 changes: 1 addition & 1 deletion detr/datasets/coco_eval.py
@@ -16,7 +16,7 @@
from pycocotools.coco import COCO
import pycocotools.mask as mask_util

from util.misc import all_gather
from ..util.misc import all_gather


class CocoEvaluator(object):
2 changes: 1 addition & 1 deletion detr/datasets/coco_panoptic.py
@@ -7,8 +7,8 @@
from PIL import Image

from panopticapi.utils import rgb2id
from util.box_ops import masks_to_boxes

from ..util.box_ops import masks_to_boxes
from .coco import make_coco_transforms


2 changes: 1 addition & 1 deletion detr/datasets/panoptic_eval.py
@@ -2,7 +2,7 @@
import json
import os

import util.misc as utils
from ..util import misc as utils

try:
from panopticapi.evaluation import pq_compute
4 changes: 2 additions & 2 deletions detr/datasets/transforms.py
@@ -9,8 +9,8 @@
import torchvision.transforms as T
import torchvision.transforms.functional as F

from util.box_ops import box_xyxy_to_cxcywh
from util.misc import interpolate
from ..util.box_ops import box_xyxy_to_cxcywh
from ..util.misc import interpolate


def crop(image, target, region):
6 changes: 3 additions & 3 deletions detr/engine.py
@@ -9,9 +9,9 @@

import torch

import util.misc as utils
from datasets.coco_eval import CocoEvaluator
from datasets.panoptic_eval import PanopticEvaluator
from .util import misc as utils
from .datasets.coco_eval import CocoEvaluator
from .datasets.panoptic_eval import PanopticEvaluator


def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
6 changes: 3 additions & 3 deletions detr/engine_multi.py
@@ -9,9 +9,9 @@

import torch

import util.misc as utils
from datasets.coco_eval import CocoEvaluator
from datasets.panoptic_eval import PanopticEvaluator
from .util import misc as utils
from .datasets.coco_eval import CocoEvaluator
from .datasets.panoptic_eval import PanopticEvaluator


def train_one_epoch(model_list, criterion_list,
10 changes: 5 additions & 5 deletions detr/hubconf.py
@@ -1,11 +1,11 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch

from models.backbone import Backbone, Joiner
from models.detr import DETR, PostProcess
from models.position_encoding import PositionEmbeddingSine
from models.segmentation import DETRsegm, PostProcessPanoptic
from models.transformer import Transformer
from .models.backbone import Backbone, Joiner
from .models.detr import DETR, PostProcess
from .models.position_encoding import PositionEmbeddingSine
from .models.segmentation import DETRsegm, PostProcessPanoptic
from .models.transformer import Transformer

dependencies = ["torch", "torchvision"]

10 changes: 5 additions & 5 deletions detr/main.py
@@ -10,11 +10,11 @@
import torch
from torch.utils.data import DataLoader, DistributedSampler

import datasets
import util.misc as utils
from datasets import build_dataset, get_coco_api_from_dataset
from engine import evaluate, train_one_epoch
from models import build_model
from . import datasets
from .util import misc as utils
from .datasets import build_dataset, get_coco_api_from_dataset
from .engine import evaluate, train_one_epoch
from .models import build_model


def get_args_parser():
3 changes: 1 addition & 2 deletions detr/models/backbone.py
@@ -11,8 +11,7 @@
from torchvision.models._utils import IntermediateLayerGetter
from typing import Dict, List

from util.misc import NestedTensor, is_main_process

from ..util.misc import NestedTensor, is_main_process
from .position_encoding import build_position_encoding


4 changes: 2 additions & 2 deletions detr/models/detr.py
@@ -6,8 +6,8 @@
import torch.nn.functional as F
from torch import nn

from util import box_ops
from util.misc import (NestedTensor, nested_tensor_from_tensor_list,
from ..util import box_ops
from ..util.misc import (NestedTensor, nested_tensor_from_tensor_list,
accuracy, get_world_size, interpolate,
is_dist_avail_and_initialized)

5 changes: 2 additions & 3 deletions detr/models/detr_multi.py
@@ -6,11 +6,10 @@
import torch.nn.functional as F
from torch import nn

from util import box_ops
from util.misc import (NestedTensor, nested_tensor_from_tensor_list,
from ..util import box_ops
from ..util.misc import (NestedTensor, nested_tensor_from_tensor_list,
accuracy, get_world_size, interpolate,
is_dist_avail_and_initialized)

from .backbone import build_backbone
from .matcher import build_matcher
from .segmentation import (DETRsegm, PostProcessPanoptic, PostProcessSegm,
2 changes: 1 addition & 1 deletion detr/models/matcher.py
@@ -6,7 +6,7 @@
from scipy.optimize import linear_sum_assignment
from torch import nn

from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
from ..util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou


class HungarianMatcher(nn.Module):
2 changes: 1 addition & 1 deletion detr/models/position_encoding.py
@@ -6,7 +6,7 @@
import torch
from torch import nn

from util.misc import NestedTensor
from ..util.misc import NestedTensor


class PositionEmbeddingSine(nn.Module):
4 changes: 2 additions & 2 deletions detr/models/segmentation.py
@@ -12,8 +12,8 @@
from torch import Tensor
from PIL import Image

import util.box_ops as box_ops
from util.misc import NestedTensor, interpolate, nested_tensor_from_tensor_list
from ..util import box_ops
from ..util.misc import NestedTensor, interpolate, nested_tensor_from_tensor_list

try:
from panopticapi.utils import id2rgb, rgb2id
3 changes: 2 additions & 1 deletion detr/run_with_submitit.py
@@ -7,9 +7,10 @@
import uuid
from pathlib import Path

import main as detection
import submitit

from . import main as detection


def parse_args():
detection_parser = detection.get_args_parser()
12 changes: 6 additions & 6 deletions detr/test_all.py
@@ -4,12 +4,12 @@

import torch

from models.matcher import HungarianMatcher
from models.position_encoding import PositionEmbeddingSine, PositionEmbeddingLearned
from models.backbone import Backbone, Joiner, BackboneBase
from util import box_ops
from util.misc import nested_tensor_from_tensor_list
from hubconf import detr_resnet50, detr_resnet50_panoptic
from .models.matcher import HungarianMatcher
from .models.position_encoding import PositionEmbeddingSine, PositionEmbeddingLearned
from .models.backbone import Backbone, Joiner, BackboneBase
from .util import box_ops
from .util.misc import nested_tensor_from_tensor_list
from .hubconf import detr_resnet50, detr_resnet50_panoptic

# onnxruntime requires python 3.5 or above
try:
37 changes: 23 additions & 14 deletions docs/INFERENCE.md
@@ -35,11 +35,11 @@ To run the pipelines, you need to provide config and model checkpoint files.

For table detection you need:
1. A detection model config JSON file
2. A pre-trained detection model checkpoint file
2. A pre-trained detection model checkpoint file (see the README section "Pre-trained Model Weights", under "Table Detection")

For table structure recognition you need:
1. A structure model config JSON file
2. A pre-trained structure model checkpoint file
2. A pre-trained structure model checkpoint file (see the README section "Pre-trained Model Weights", under "Table Structure Recognition")

For end-to-end table extraction you need all four of the above files.

@@ -64,13 +64,15 @@ When running the sample code:
```
where `bbox` is in `[xmin, ymin, xmax, ymax]` format.
## Running From the Command Line
Change to `src` directory:
```
cd src
Change directory to the parent of the cloned repository:

```bash
cd parent_dir
```

### To run table detection on a folder of document page images:
```
python inference.py --mode detect --detection_config_path detection_config.json --detection_model_path ../pubtables1m_detection_detr_r18.pth --detection_device cuda --image_dir [PATH TO DOCUMENT PAGE IMAGES] --words_dir [OPTIONAL PATH TO WORDS (ex. OCR) EXTRACTED FROM DOCUMENT PAGE IMAGES] --out_dir [PATH TO SAVE DETECTION OUTPUT] [FLAGS: -o,-z,-v,-p] --crop_padding 20
python -m table_transformer.src.inference --mode detect --detection_config_path table_transformer/src/detection_config.json --detection_model_path ../pubtables1m_detection_detr_r18.pth --detection_device cuda --image_dir [PATH TO DOCUMENT PAGE IMAGES] --words_dir [OPTIONAL PATH TO WORDS (ex. OCR) EXTRACTED FROM DOCUMENT PAGE IMAGES] --out_dir [PATH TO SAVE DETECTION OUTPUT] [FLAGS: -o,-z,-v,-p] --crop_padding 20
```
where:
- `--words_dir` is optional. But text content cannot be included in the final result without it.
@@ -82,20 +84,27 @@ where:

### To run table structure recognition on a folder of cropped table images:
```
python inference.py --mode recognize --structure_config_path structure_config.json --structure_model_path ../pubtables1m_structure_detr_r18.pth --structure_device cuda --image_dir [PATH TO CROPPED TABLE IMAGES] --words_dir [OPTIONAL PATH TO WORDS (ex. OCR) EXTRACTED FROM CROPPED TABLE IMAGES] --out_dir [PATH TO SAVE DETECTION OUTPUT] [FLAGS: -o,-c,-l,-m,-z,-v]
python -m table_transformer.src.inference --mode recognize --structure_config_path table_transformer/src/structure_config.json --structure_model_path ../pubtables1m_structure_detr_r18.pth --structure_device cuda --image_dir [PATH TO CROPPED TABLE IMAGES] --words_dir [OPTIONAL PATH TO WORDS (ex. OCR) EXTRACTED FROM CROPPED TABLE IMAGES] --out_dir [PATH TO SAVE DETECTION OUTPUT] [FLAGS: -o,-c,-l,-m,-z,-v]
```
where:
- `--words_dir` is optional. But text content cannot be included in the final result without it. You can still use the visualize flag (-z) to see a depiction of the recognized table structure even without supplying text as input.
- `-o` means to output detected objects (with bounding boxes)
- `-l` means to output a list of recognized cells and their properties
- `-m` means to output the recognized table in HTML format
- `-c` means to output the recognized table in CSV format
- `-v` means to print (verbose) output to the console
- `-z` means to create and save (visualize) figures depicting the recognized tables and recognized cells in the tables
- `-o`: Output detected objects with bounding boxes (`args.objects`).
- `-l`: Output a list of recognized cells and their properties (`args.cells`).
- `-m`: Output the recognized table in **HTML** format (`args.html`).
- `-c`: Output the recognized table in **CSV** format (`args.csv`).
- `-v`: Print detailed (verbose) output to the console (`args.verbose`).
- `-z`: Create and save figures visualizing the recognized tables and cells (`args.visualize`).

### To run table extraction (detection and recognition combined end-to-end) on a folder of document page images:
```
python inference.py --mode extract --detection_config_path detection_config.json --detection_model_path ../pubtables1m_detection_detr_r18.pth --detection_device cuda --structure_config_path structure_config.json --structure_model_path ../pubtables1m_structure_detr_r18.pth --structure_device cuda --image_dir [PATH TO DOCUMENT PAGE IMAGES] --words_dir [OPTIONAL PATH TO WORDS (ex. OCR) EXTRACTED FROM DOCUMENT PAGE IMAGES] --out_dir [PATH TO SAVE DETECTION OUTPUT] [FLAGS: -o,-c,-l,-m,-z,-v,-p] --crop_padding 20
python -m table_transformer.src.inference --mode extract --detection_config_path table_transformer/src/detection_config.json --detection_model_path table_transformer/models/pubtables1m_detection_detr_r18.pth --detection_device cuda --structure_config_path table_transformer/src/structure_config.json --structure_model_path table_transformer/models/pubtables1m_structure_detr_r18.pth --structure_device cuda --image_dir [PATH TO DOCUMENT PAGE IMAGES] --words_dir [OPTIONAL PATH TO WORDS (ex. OCR) EXTRACTED FROM DOCUMENT PAGE IMAGES] --out_dir [PATH TO SAVE DETECTION OUTPUT] [FLAGS: -o,-c,-l,-m,-z,-v,-p] --crop_padding 20
```
where:
- `--words_dir` is optional. But text content cannot be included in the final result without it. You can still use the visualize flag (-z) to see a depiction of the recognized table structure even without supplying text as input.

## ⚠️ Important: Image Format
PNG images with transparency (RGBA - 4 channels) will cause errors. Convert images to RGB format (3 channels) before processing:
- Use JPEG/JPG format, OR
- Convert PNG to RGB (remove alpha/transparency channel)

The model expects 3-channel RGB images only.
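
If you use Pillow, a minimal sketch of the conversion (assuming `page.png` is your input image):

```
from PIL import Image

# Ensure a 3-channel RGB image before running inference
image = Image.open("page.png")
if image.mode != "RGB":
    image = image.convert("RGB")  # drops the alpha channel from RGBA PNGs
image.save("page.jpg")
```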
62 changes: 62 additions & 0 deletions docs/INSTALL_AS_PACKAGE.md
@@ -0,0 +1,62 @@
# How to Import table-transformer As a Package
This document gives step-by-step instructions for installing and using table-transformer as a local Python package.

## Directory Setup
1. Create a directory `parent_directory` (e.g., `table_transformer_parent`)
2. Clone the repository inside this directory, such that the structure is:

```
table_transformer_parent/table_transformer/
```

3. Create `parent_directory/pyproject.toml`.

### `pyproject.toml`

```toml
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "table_transformer"
version = "0.1.0"
description = "Local version of table-transformer"
authors = [
{ name="Your Name", email="you@example.com" }
]
dependencies = []

[tool.setuptools.packages.find]
where = ["."]
```
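
With `where = ["."]`, setuptools discovers the `table_transformer` package; this relies on the `__init__.py` files this change adds at the repository root and in `src/`.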

## How to Install

Activate your `tables-detr` conda environment, then run:

```bash
pip install path/to/parent_directory
```
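
If you plan to modify the cloned source, an editable install (`pip install -e path/to/parent_directory`) picks up changes without reinstalling.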

## Example Use

```python
import fitz  # PyMuPDF, used here to render a PDF page to an image
from PIL import Image

from table_transformer.src.inference import TableExtractionPipeline

# Example paths and devices (adjust to your setup)
DEVICES = {"det": "cuda", "str": "cuda"}
paths = {
    "det_config": "table_transformer/src/detection_config.json",
    "det_model": "table_transformer/models/pubtables1m_detection_detr_r18.pth",
    "str_config": "table_transformer/src/structure_config.json",
    "str_model": "table_transformer/models/pubtables1m_structure_detr_r18.pth",
}

pipeline = TableExtractionPipeline(
    det_device=DEVICES["det"],
    str_device=DEVICES["str"],
    det_config_path=paths["det_config"],
    det_model_path=paths["det_model"],
    str_config_path=paths["str_config"],
    str_model_path=paths["str_model"],
)

# Render the first page of a PDF to an RGB image
doc = fitz.open("example.pdf")  # placeholder document path
page = doc[0]
pix = page.get_pixmap(dpi=200)
image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

# Run table detection; the second argument is the (optional) list of word tokens
detect = pipeline.detect(image, [])
```
Empty file added src/__init__.py
Empty file.
10 changes: 4 additions & 6 deletions src/eval.py
@@ -21,12 +21,10 @@
from fitz import Rect
from PIL import Image

sys.path.append("../detr")
import util.misc as utils
from datasets.coco_eval import CocoEvaluator
import postprocess
import grits
from grits import grits_con, grits_top, grits_loc
from ..detr.util import misc as utils
from ..detr.datasets.coco_eval import CocoEvaluator
from . import postprocess, grits
from .grits import grits_con, grits_top, grits_loc


structure_class_names = [