diff --git a/package-lock.json b/package-lock.json
index 8a23ca84..40eb5db9 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,5 +1,5 @@
{
- "name": "mundane-assignment-police",
+ "name": "map",
"lockfileVersion": 2,
"requires": true,
"packages": {}
diff --git a/pom.xml b/pom.xml
index c6177209..05953c1a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -15,7 +15,7 @@
<description>Web-app that assists in checking students' assignments</description>
<java.version>11</java.version>
- <kotlin.version>1.5.31</kotlin.version>
+ <kotlin.version>1.6.21</kotlin.version>
@@ -85,6 +85,11 @@
<version>5.0.0.M1</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>org.jetbrains.kotlinx</groupId>
+ <artifactId>dataframe</artifactId>
+ <version>0.8.0-dev-1005</version>
+ </dependency>
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/RuleViolation.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/RuleViolation.kt
index 1782c96e..fd37b0e1 100644
--- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/RuleViolation.kt
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/RuleViolation.kt
@@ -13,3 +13,4 @@ data class RuleViolation(
) {
// override fun toString() = if (lines.count() == 1) "[${lines.first().line}, p.${lines.first().page}] --> '$message'" else ""
}
+
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt
index e8e2a712..ec2f7b2b 100644
--- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt
@@ -8,6 +8,7 @@ import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFArea.TABLE_OF_CONTENT
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion
import com.github.darderion.mundaneassignmentpolice.pdfdocument.list.PDFList
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Coordinate
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line
class ListRule(
@@ -26,7 +27,7 @@ class ListRule(
document.areas!!.tableOfContents.map {
document.text.filter { it.area == TABLE_OF_CONTENT }.firstOrNull { line ->
line.content.contains(it)
- }?: Line(0, 0, 0, listOf(), TABLE_OF_CONTENT)
+ }?: Line(0, 0, 0, listOf(), TABLE_OF_CONTENT, Coordinate(0,0))
}
)
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/symbol/BasicSymbolRule.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/symbol/BasicSymbolRule.kt
index 41b5405c..c1414fe3 100644
--- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/symbol/BasicSymbolRule.kt
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/symbol/BasicSymbolRule.kt
@@ -39,6 +39,7 @@ class BasicSymbolRule(
when (direction) {
LEFT -> sideTexts.removeAt(1)
RIGHT -> sideTexts.removeAt(0)
+ else -> {}
}
val neighbors = (if (notIgnoredNeighbors.isNotEmpty()) sideTexts
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRule.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRule.kt
new file mode 100644
index 00000000..697401e4
--- /dev/null
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRule.kt
@@ -0,0 +1,32 @@
+package com.github.darderion.mundaneassignmentpolice.checker.rule.table
+
+import com.github.darderion.mundaneassignmentpolice.checker.RuleViolation
+import com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType
+import com.github.darderion.mundaneassignmentpolice.checker.rule.Rule
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.tables.Table
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line
+
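+// Rule over PDF tables: each predicate maps a Table to the lines it objects to,
+// and every non-empty result is reported as a RuleViolation.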
+class TableRule (
+ val predicates: MutableList<(Table) -> List<Line>>,
+ type: RuleViolationType,
+ area: PDFRegion,
+ name: String
+ ): Rule(area, name, type){
+ override fun process(document: PDFDocument): List<RuleViolation> {
+ val rulesViolations: MutableSet<RuleViolation> = mutableSetOf()
+
+ predicates.forEach { predicate ->
+ rulesViolations.addAll(
+ document.tables.map {
+ predicate(it)
+ }.filter { it.isNotEmpty() }.map {
+ RuleViolation(it, name, type)
+ }
+ )
+ }
+
+ return rulesViolations.toList()
+ }
+ }
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRuleBuilder.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRuleBuilder.kt
new file mode 100644
index 00000000..c9acd21f
--- /dev/null
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRuleBuilder.kt
@@ -0,0 +1,18 @@
+package com.github.darderion.mundaneassignmentpolice.checker.rule.table
+
+import com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.tables.Table
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line
+
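+// Builder for TableRule; illustrative usage (mirrors how TABLE_RULE is built in Rules.kt):
+//   TableRuleBuilder()
+//       .called("Rule name")
+//       .disallow { table -> table.getLines() }
+//       .getRule()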
+class TableRuleBuilder {
+ private val predicates: MutableList<(Table) -> List<Line>> = mutableListOf()
+ private var type: RuleViolationType = RuleViolationType.Error
+ private var region: PDFRegion = PDFRegion.EVERYWHERE
+ private var name: String = "Rule name"
+
+ fun called(name: String) = this.also { this.name = name }
+
+ fun disallow(predicate: (table: Table) -> List<Line>) = this.also { predicates.add(predicate) }
+ fun getRule() = TableRule(predicates, type, region, name)
+}
\ No newline at end of file
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/word/BasicWordRule.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/word/BasicWordRule.kt
index e5310909..e0f0ddd0 100644
--- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/word/BasicWordRule.kt
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/word/BasicWordRule.kt
@@ -39,6 +39,7 @@ class BasicWordRule(
when (direction) {
Direction.LEFT -> sideWords.removeAt(1)
Direction.RIGHT -> sideWords.removeAt(0)
+ else -> {}
}
val filteredSideWords = sideWords
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/Annotations.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/Annotations.kt
index 721b9d2e..edc86d07 100644
--- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/Annotations.kt
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/Annotations.kt
@@ -13,10 +13,12 @@ class Annotations {
var document = PDFBox().getDocument(pdf.name)
lines.forEach { line ->
document = PDFBox().addLine(document, line.page,
- Coordinate(line.position.x to (pdf.height - (line.text.maxOf { it.position.y } + 2))),
- (pdf.width - (line.position.x + 50)).toInt()
+ Coordinate(line.startPosition.x to (pdf.height - (line.text.maxOf { it.position.y } + 2))),
+ (line.endPosition.x - line.startPosition.x).toInt()
)
}
+
+
Files.createDirectories(Paths.get("${pdfFolder}ruleviolations/"))
val fileName = "${pdfFolder}ruleviolations/${
pdf.name.split('/')[pdf.name.split('/').count() - 1].replace(".pdf", "")
@@ -24,5 +26,6 @@ class Annotations {
document.save(fileName)
return fileName
}
+
}
}
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocument.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocument.kt
index a7c7e71b..e499ddd5 100644
--- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocument.kt
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocument.kt
@@ -1,10 +1,12 @@
package com.github.darderion.mundaneassignmentpolice.pdfdocument
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.tables.Table
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line
import mu.KotlinLogging
class PDFDocument(val name: String = "PDF",
val text: List<Line>,
+ val tables: List<Table>,
val width: Double = defaultPageWidth,
val height: Double = defaultPageHeight
) {
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/list/PDFList.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/list/PDFList.kt
index c55826eb..3ce27e68 100644
--- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/list/PDFList.kt
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/list/PDFList.kt
@@ -54,7 +54,7 @@ data class PDFList(val value: MutableList = mutableListOf(), val nodes: Mu
*/
fun getLists(lines: List<Line>): List<PDFList<Line>> {
// Adding a line to process a text that has no lines after a list
- val lines = lines + Line(-1, -1, -1, listOf(Word("NOT A LIST ITEM", Font(0.0f), Coordinate(1000, -1))))
+ val lines = lines + Line(-1, -1, -1, listOf(Word("NOT A LIST ITEM", Font(0.0f), Coordinate(1000, -1))), null, Coordinate(0,0))
val lists: MutableList<PDFList<Line>> = mutableListOf()
val stack: Stack<PDFList<Line>> = Stack()
@@ -69,11 +69,11 @@ data class PDFList(val value: MutableList = mutableListOf(), val nodes: Mu
stack.push(stack.peek().nodes.first())
}
} else {
- previousPosition = stack.peek().value.first().position
- if (previousPosition hasSameXAs line.position) { // 1. lorem OR lorem
+ previousPosition = stack.peek().value.first().startPosition
+ if (previousPosition hasSameXAs line.startPosition) { // 1. lorem OR lorem
stack.peek().value.add(line) // lorem lorem
} else {
- if (previousPosition.x < line.position.x) {
+ if (previousPosition.x < line.startPosition.x) {
if (isListItem(line)) {
stack.peek().nodes.add(PDFList(line.drop(2))) // lorem
stack.push(stack.peek().nodes.last()) // 1. lorem
@@ -83,17 +83,17 @@ data class PDFList(val value: MutableList = mutableListOf(), val nodes: Mu
}
} else { // lorem OR lorem OR ... lorem OR ... lorem
while (!( stack.isEmpty() || // lorem 2. lorem lorem 2. lorem
- (isListItem(line) && previousPosition hasSameXAs line.drop(2).position) ||
- previousPosition hasSameXAs line.position)) {
+ (isListItem(line) && previousPosition hasSameXAs line.drop(2).startPosition) ||
+ previousPosition hasSameXAs line.startPosition)) {
previousList = stack.pop()
if (stack.isNotEmpty()) {
- previousPosition = stack.peek().value.first().position
+ previousPosition = stack.peek().value.first().startPosition
}
}
if (stack.isEmpty()) {
lists.add(previousList!!)
} else {
- if (previousPosition hasSameXAs line.position) { // lorem
+ if (previousPosition hasSameXAs line.startPosition) { // lorem
stack.peek().value.add(line) // lorem
} else {
stack.pop()
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Cell.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Cell.kt
new file mode 100644
index 00000000..880cd0ac
--- /dev/null
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Cell.kt
@@ -0,0 +1,11 @@
+package com.github.darderion.mundaneassignmentpolice.pdfdocument.tables
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Coordinate
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line
+
+data class Cell(
+ val page: Int,
+ val cellText: MutableList<String>,
+ var cellLines: MutableList<Line>,
+ val leftCorner: Coordinate,
+ val rightCorner: Coordinate
+)
\ No newline at end of file
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Table.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Table.kt
new file mode 100644
index 00000000..f41c97cc
--- /dev/null
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Table.kt
@@ -0,0 +1,69 @@
+package com.github.darderion.mundaneassignmentpolice.pdfdocument.tables
+
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.*
+import com.github.darderion.mundaneassignmentpolice.wrapper.PDFBox
+import org.jetbrains.kotlinx.dataframe.AnyFrame
+import org.jetbrains.kotlinx.dataframe.DataFrame
+import org.jetbrains.kotlinx.dataframe.api.*
+
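+// Wraps a single table exported by TableExtractionScript.py: the CSV ends with a
+// "table information" block (page, bounding box, row/column counts) and every cell
+// value is prefixed with an "x1 = .. y1 = .. x2 = .. y2 = .." coordinate line,
+// which the init block and getCell() parse back into Cell objects.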
+class Table(val df: DataFrame<*>){
+
+ val page : Int
+ val x1 : Double
+ val y1 : Double
+ val x2 : Double
+ val y2 : Double
+ val rowCount : Int
+ val colCount : Int
+ val cells: MutableList<Cell> = mutableListOf()
+ init {
+ val indexTableInf = df.select{ cols(0) }.last { it[0] == "table information"}.index()
+ val tableInf = df.select{cols(0)}.filter { it.index() >= indexTableInf }
+
+ this.page = tableInf[pageTableIndex][0].toString().toInt() - 1
+ this.x1 = tableInf[x1TableIndex][0].toString().toDouble()
+ this.y1 = defaultPageHeight - tableInf[y1TableIndex][0].toString().toDouble()
+ this.x2 = tableInf[x2TableIndex][0].toString().toDouble()
+ this.y2 = defaultPageHeight - tableInf[y2TableIndex][0].toString().toDouble()
+ this.rowCount = tableInf[rowTableIndex][0].toString().toInt()
+ this.colCount = tableInf[colTableIndex][0].toString().toInt()
+ val tableData = df.filter { it.index() < indexTableInf }
+
+ tableData.forEachColumn { it.forEach { getCell(it.toString()) } }
+ }
+
+ private fun getCell(text: String){
+
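+ // The first line of the cell's text holds its camelot coordinates; the y values are
+ // flipped via defaultPageHeight because camelot reports a bottom-left origin.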
+ val coordinates = text.lines().first().split(" ")
+ val x1 = coordinates[x1CellIndex].toDouble()
+ val y1 = defaultPageHeight - coordinates[y1CellIndex].toDouble()
+ val x2 = coordinates[x2CellIndex].toDouble()
+ val y2 = defaultPageHeight - coordinates[y2CellIndex].toDouble()
+
+ val cellText = text.lines().filterIndexed{ index, _ -> index > 0 }.toMutableList()
+
+ cells.add(Cell(page, cellText, mutableListOf(), Coordinate(x1,y1), Coordinate(x2,y2)))
+ }
+
+ fun getLines(): List<Line>{
+ val lines = mutableListOf<Line>()
+ cells.forEach{ lines.addAll(it.cellLines) }
+ return lines
+ }
+
+ companion object {
+ private const val defaultPageHeight = 842.0
+ private const val x1CellIndex = 2
+ private const val y1CellIndex = 5
+ private const val x2CellIndex = 8
+ private const val y2CellIndex = 11
+
+ private const val pageTableIndex = 2
+ private const val x1TableIndex = 4
+ private const val y1TableIndex = 5
+ private const val x2TableIndex = 6
+ private const val y2TableIndex = 7
+ private const val rowTableIndex = 9
+ private const val colTableIndex = 11
+ }
+}
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/text/Line.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/text/Line.kt
index a7003cc6..021e112a 100644
--- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/text/Line.kt
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/text/Line.kt
@@ -3,12 +3,12 @@ package com.github.darderion.mundaneassignmentpolice.pdfdocument.text
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFArea
data class Line(val index: Int, val page: Int, val documentIndex: Int,
- val text: List<Word>, var area: PDFArea? = null
+ val text: List<Word>, var area: PDFArea? = null, var endPosition: Coordinate
) {
val content: String
get() = text.joinToString("") { it.text }
- val position: Coordinate
+ val startPosition: Coordinate
get() = if (text.isNotEmpty()) text.first().position else Coordinate(0, 0)
val first: String?
@@ -17,7 +17,10 @@ data class Line(val index: Int, val page: Int, val documentIndex: Int,
val second: String?
get() = if (text.count() > 1) text[1].text else null
- override fun toString() = "[$documentIndex -- $index, p.$page, $area, ${position.x}] --> '$content'"
+ override fun toString() = "[$documentIndex -- $index, p.$page, $area, ${startPosition.x}] --> '$content'"
- fun drop(numberOfItems: Int) = Line(index, page, documentIndex, text.drop(numberOfItems), area)
+ fun drop(numberOfItems: Int) = Line(index, page, documentIndex, text.drop(numberOfItems), area, Coordinate(0,0))
+ companion object{
+ private const val defaultPageWidth = 595.22
+ }
}
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt
index ec5708da..5195f93b 100644
--- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt
@@ -4,7 +4,9 @@ import com.github.darderion.mundaneassignmentpolice.checker.rule.Rule
val RULE_SET_RU = RuleSet(
mutableListOf(
- RULE_LITLINK,
+ TABLE_RULE,
+
+ /*RULE_LITLINK,
RULE_SHORT_DASH,
RULE_MEDIUM_DASH,
RULE_LONG_DASH,
@@ -22,9 +24,12 @@ val RULE_SET_RU = RuleSet(
RULE_VARIOUS_ABBREVIATIONS,
RULE_SECTIONS_ORDER,
RULE_LOW_QUALITY_CONFERENCES,
+
+ */
)
- + RULES_SPACE_AROUND_BRACKETS
+ /*+ RULES_SPACE_AROUND_BRACKETS
+ RULES_SMALL_NUMBERS
-)
+ */
+)
class RuleSet(val rules: List<Rule>) {}
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt
index 346e4a12..568e2cb9 100644
--- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt
@@ -7,6 +7,7 @@ import com.github.darderion.mundaneassignmentpolice.checker.rule.regex.RegexRule
import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.SymbolRuleBuilder
import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.and
import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.or
+import com.github.darderion.mundaneassignmentpolice.checker.rule.table.TableRuleBuilder
import com.github.darderion.mundaneassignmentpolice.checker.rule.tableofcontent.TableOfContentRuleBuilder
import com.github.darderion.mundaneassignmentpolice.checker.rule.url.URLRuleBuilder
import com.github.darderion.mundaneassignmentpolice.checker.rule.url.then
@@ -15,6 +16,7 @@ import com.github.darderion.mundaneassignmentpolice.checker.rule.word.WordRuleBu
import com.github.darderion.mundaneassignmentpolice.checker.rule.word.or
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFArea
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line
import com.github.darderion.mundaneassignmentpolice.utils.InvalidOperationException
import com.github.darderion.mundaneassignmentpolice.utils.LowQualityConferencesUtil
import com.github.darderion.mundaneassignmentpolice.utils.ResourcesUtil
@@ -416,3 +418,12 @@ val RULE_LOW_QUALITY_CONFERENCES = URLRuleBuilder()
.any { conference -> url.text.contains(conference) }
}.map { it to it.lines }
}.getRule()
+
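+// Rule that reports every line belonging to a table cell ("Все клетки" = "all cells").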
+val TABLE_RULE = TableRuleBuilder()
+ .called("Все клетки")
+ .disallow { table ->
+ val lines = mutableListOf<Line>()
+ table.cells.forEach { cell -> lines.addAll(cell.cellLines) }
+ lines
+ }
+ .getRule()
\ No newline at end of file
diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/wrapper/PDFBox.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/wrapper/PDFBox.kt
index f4ca5596..3ed05144 100644
--- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/wrapper/PDFBox.kt
+++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/wrapper/PDFBox.kt
@@ -1,6 +1,7 @@
package com.github.darderion.mundaneassignmentpolice.wrapper
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.tables.Table
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.*
import com.github.darderion.mundaneassignmentpolice.utils.imgToBase64String
import org.apache.pdfbox.pdmodel.PDDocument
@@ -11,9 +12,19 @@ import org.apache.pdfbox.pdmodel.font.PDType1Font
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject
import org.apache.pdfbox.text.PDFTextStripper
+import org.jetbrains.kotlinx.dataframe.DataFrame
+import org.jetbrains.kotlinx.dataframe.io.read
import java.awt.Color
import java.awt.image.RenderedImage
import java.io.*
+import java.nio.file.Files
+import java.nio.file.LinkOption
+import java.util.*
+import java.util.concurrent.TimeUnit
+import kotlin.collections.ArrayList
+import kotlin.collections.HashMap
+import kotlin.collections.LinkedHashSet
+import kotlin.io.path.Path
class PDFBox {
@@ -90,6 +101,8 @@ class PDFBox {
* @return PDFDocument
*/
fun getPDF(fileName: String): PDFDocument {
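+ // Tables are extracted up front so their content can be handled separately from the normal text lines.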
+ val tables = getTables(fileName)
+
val pdfText: MutableList<Line> = mutableListOf()
val document = getDocument(fileName)
@@ -117,7 +130,7 @@ class PDFBox {
var font: Font?
var word: String
var symb: Symbol
- val words: MutableList<Word> = mutableListOf()
+ var words: MutableList<Word> = mutableListOf()
var contentIndex: Int
var contentItem: String
var coordinates = Coordinate(0, 0)
@@ -166,13 +179,51 @@ class PDFBox {
if (font == null && word.isEmpty()) font = Font(0.0f)
words.add(Word(word, font!!, coordinates))
- Line(line, pageIndex, lineIndex, words.toList())
- })
- }
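+ // Drop words that fall inside a table on this page; table content is re-added as cell lines further down.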
+ tables.filter { table -> table.page == pageIndex }.forEach { table ->
+ words = words.filter { word -> !isWordInTable(pageIndex, word, table) }
+ .filter { it.text.isNotEmpty() }.toMutableList()
+ }
+
+ if (document.pages[pageIndex].resources.xObjectNames.count() != 0){
+ Line(line, pageIndex, lineIndex, words.toList(),null,Coordinate(0,0))
+ }
+ else {
+ Line(line, pageIndex, lineIndex, words.toList(),null,stripper.symbols[stripperIndex-1].position)
+ }
+ }
+ )
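+ // Append the text of every table cell on this page as extra Lines so that rules can inspect it.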
+ var line = text.lines().size
+ tables.forEach { table ->
+ if (table.page == pageIndex)
+ table.cells.forEach { cell ->
+ val cellLines = mutableListOf<Line>()
+ cellLines.addAll(cell.cellText.filter { it.isNotEmpty() }.map { content ->
+ words.clear()
+ content.split(" ").forEach {
+ words.add(Word(it, Font(12f), cell.leftCorner))
+ }
+ lineIndex += 1
+ line += 1
+ val tableLine = Line(line, pageIndex, lineIndex, words.toList(),
+ endPosition = Coordinate(cell.rightCorner.x, cell.rightCorner.y))
+ cell.cellLines = cellLines
+ pdfText.add(tableLine)
+ tableLine
+ }
+
+ )
+ }
+ }
+ }
document.close()
- return PDFDocument(fileName, pdfText, size.width.toDouble(), size.height.toDouble())
+ return PDFDocument(fileName, pdfText, tables, size.width.toDouble(), size.height.toDouble())
+ }
+
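+ // True when the word lies inside the table's bounding box (the y axis is already flipped
+ // in Table, so vertically "inside" means table.y2 <= y <= table.y1).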
+ private fun isWordInTable(page: Int, word: Word, table: Table): Boolean {
+ return page == table.page &&
+ word.position.x >= table.x1 && word.position.y <= table.y1 &&
+ word.position.x <= table.x2 && word.position.y >= table.y2
}
fun getPDFSize(fileName: String): Int {
@@ -209,4 +260,37 @@ class PDFBox {
}
return images
}
+
+ /**
+ * Returns tables from PDF
+ * @param path pdf's path
+ * @return list of Table
+ */
+ fun getTables(path: String): List<Table>{
+
+ val workingDirPath = System.getProperty("user.home") + "/map"
+ val fileName = path.replace("uploads/","")
+ val tables = mutableListOf<Table>()
+
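+ // Run the Python extraction script only if this PDF's tables have not been exported yet;
+ // the resulting CSV files are cached under uploads/tables/$fileName/.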
+ if (!Files.exists(Path("$workingDirPath/uploads/tables/$fileName"), LinkOption.NOFOLLOW_LINKS)) {
+
+ ProcessBuilder(
+ "src/main/python/venv/bin/python3",
+ "src/main/python/TableExtractionScript.py",
+ "extraction", path
+ )
+ .directory(File(workingDirPath))
+ .redirectOutput(ProcessBuilder.Redirect.INHERIT)
+ .start()
+ .waitFor()
+ }
+
+ File("$workingDirPath/uploads/tables/$fileName/").walkBottomUp().filter { it.isFile }.forEach {
+ val df = DataFrame.read(it)
+ tables.add(Table(df))
+ }
+
+ return tables
+ }
+
}
diff --git a/src/main/python/TableExtractionScript.py b/src/main/python/TableExtractionScript.py
new file mode 100755
index 00000000..a42773de
--- /dev/null
+++ b/src/main/python/TableExtractionScript.py
@@ -0,0 +1,53 @@
+import PyPDF2
+from PyPDF2.errors import PdfReadError
+import src.main.python.camelot
+import pandas
+import os
+import sys
+from pathlib import Path
+sys.path.insert(0, '../src')
+
+
+def extraction(pdf_path):
+
+ os.chdir(os.path.expanduser("~/map/"))
+ file_name = Path(pdf_path).stem
+
+ try:
+ PyPDF2.PdfFileReader(open(pdf_path, 'rb'))
+ except PyPDF2.errors.PdfReadError:
+ print("invalid PDF file")
+ else:
+ if not os.path.isdir(f'uploads/tables/{file_name}'):
+ os.mkdir(f'uploads/tables/{file_name}')
+
+ tables = src.main.python.camelot.read_pdf(pdf_path, flavor='lattice', pages='all', line_scale=30)
+
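+ # Prefix each cell's text with its bounding box and append a "table information" block,
+ # so the Kotlin Table class can rebuild coordinates and shape from the exported CSV.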
+ for k in range(len(tables)):
+ left_x, left_y, right_x, right_y = 596, 896, 0, 0
+ for i in range(len(tables[k].cells)):
+ for j in range(len(tables[k].cells[i])):
+ left_x = min(left_x, tables[k].cells[i][j].x1)
+ left_y = min(left_y, tables[k].cells[i][j].y1)
+ right_x = max(right_x, tables[k].cells[i][j].x2)
+ right_y = max(right_y, tables[k].cells[i][j].y2)
+ tables[k].df.at[i, j] = f'x1 = {tables[k].cells[i][j].x1} ' \
+ f'y1 = {tables[k].cells[i][j].y1} ' \
+ f'x2 = {tables[k].cells[i][j].x2} ' \
+ f'y2 = {tables[k].cells[i][j].y2} \n ' \
+ + tables[k].df.at[i, j]
+ tables[k].df = pandas.concat([pandas.DataFrame(['table data']), tables[k].df,
+ pandas.DataFrame(['table information',
+ 'page', tables[k].page,
+ 'table area', left_x, left_y, right_x, right_y,
+ 'rows', len(tables[k].rows),
+ 'columns', len(tables[k].cols)],
+ )],
+ ignore_index=True)
+ tables.export(f'uploads/tables/{file_name}/{file_name}.csv',
+ f='csv',
+ compress=False)
+
+
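+# The first CLI argument selects the function to call ("extraction"); the second is the PDF path.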
+if __name__ == '__main__':
+ globals()[sys.argv[1]](sys.argv[2])
diff --git a/src/main/python/camelot/__init__.py b/src/main/python/camelot/__init__.py
new file mode 100755
index 00000000..bc4beb62
--- /dev/null
+++ b/src/main/python/camelot/__init__.py
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+
+import logging
+
+from .__version__ import __version__
+from .io import read_pdf
+from .plotting import PlotMethods
+
+
+# set up logging
+logger = logging.getLogger("camelot")
+
+format_string = "%(asctime)s - %(levelname)s - %(message)s"
+formatter = logging.Formatter(format_string, datefmt="%Y-%m-%dT%H:%M:%S")
+handler = logging.StreamHandler()
+handler.setFormatter(formatter)
+
+logger.addHandler(handler)
+
+# instantiate plot method
+plot = PlotMethods()
diff --git a/src/main/python/camelot/__main__.py b/src/main/python/camelot/__main__.py
new file mode 100644
index 00000000..ac90c95f
--- /dev/null
+++ b/src/main/python/camelot/__main__.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+
+__all__ = ("main",)
+
+
+def main():
+ from src.main.python.camelot.cli import cli
+
+ cli()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/main/python/camelot/__version__.py b/src/main/python/camelot/__version__.py
new file mode 100644
index 00000000..72364b92
--- /dev/null
+++ b/src/main/python/camelot/__version__.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+VERSION = (0, 11, 0)
+PRERELEASE = None # alpha, beta or rc
+REVISION = None
+
+
+def generate_version(version, prerelease=None, revision=None):
+ version_parts = [".".join(map(str, version))]
+ if prerelease is not None:
+ version_parts.append(f"-{prerelease}")
+ if revision is not None:
+ version_parts.append(f".{revision}")
+ return "".join(version_parts)
+
+
+__title__ = "camelot-py"
+__description__ = "PDF Table Extraction for Humans."
+__url__ = "http://camelot-py.readthedocs.io/"
+__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
+__author__ = "Vinayak Mehta"
+__author_email__ = "vmehta94@gmail.com"
+__license__ = "MIT License"
diff --git a/src/main/python/camelot/backends/__init__.py b/src/main/python/camelot/backends/__init__.py
new file mode 100644
index 00000000..8d0b91e9
--- /dev/null
+++ b/src/main/python/camelot/backends/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+
+from .image_conversion import ImageConversionBackend
diff --git a/src/main/python/camelot/backends/ghostscript_backend.py b/src/main/python/camelot/backends/ghostscript_backend.py
new file mode 100644
index 00000000..1de7da19
--- /dev/null
+++ b/src/main/python/camelot/backends/ghostscript_backend.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+
+import sys
+import ctypes
+from ctypes.util import find_library
+
+
+def installed_posix():
+ library = find_library("gs")
+ return library is not None
+
+
+def installed_windows():
+ library = find_library(
+ "".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll"))
+ )
+ return library is not None
+
+
+class GhostscriptBackend(object):
+ def installed(self):
+ if sys.platform in ["linux", "darwin"]:
+ return installed_posix()
+ elif sys.platform == "win32":
+ return installed_windows()
+ else:
+ return installed_posix()
+
+ def convert(self, pdf_path, png_path, resolution=300):
+ if not self.installed():
+ raise OSError(
+ "Ghostscript is not installed. You can install it using the instructions"
+ " here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
+ )
+
+ import ghostscript
+
+ gs_command = [
+ "gs",
+ "-q",
+ "-sDEVICE=png16m",
+ "-o",
+ png_path,
+ f"-r{resolution}",
+ pdf_path,
+ ]
+ ghostscript.Ghostscript(*gs_command)
diff --git a/src/main/python/camelot/backends/image_conversion.py b/src/main/python/camelot/backends/image_conversion.py
new file mode 100644
index 00000000..7d2c4d7a
--- /dev/null
+++ b/src/main/python/camelot/backends/image_conversion.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+from .poppler_backend import PopplerBackend
+from .ghostscript_backend import GhostscriptBackend
+
+BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
+
+
+class ImageConversionBackend(object):
+ def __init__(self, backend="poppler", use_fallback=True):
+ if backend not in BACKENDS.keys():
+ raise ValueError(f"Image conversion backend '{backend}' not supported")
+
+ self.backend = backend
+ self.use_fallback = use_fallback
+ self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys()))
+
+ def convert(self, pdf_path, png_path):
+ try:
+ converter = BACKENDS[self.backend]()
+ converter.convert(pdf_path, png_path)
+ except Exception as e:
+ import sys
+
+ if self.use_fallback:
+ for fallback in self.fallbacks:
+ try:
+ converter = BACKENDS[fallback]()
+ converter.convert(pdf_path, png_path)
+ except Exception as e:
+ raise type(e)(
+ str(e) + f" with image conversion backend '{fallback}'"
+ ).with_traceback(sys.exc_info()[2])
+ continue
+ else:
+ break
+ else:
+ raise type(e)(
+ str(e) + f" with image conversion backend '{self.backend}'"
+ ).with_traceback(sys.exc_info()[2])
diff --git a/src/main/python/camelot/backends/poppler_backend.py b/src/main/python/camelot/backends/poppler_backend.py
new file mode 100644
index 00000000..41033729
--- /dev/null
+++ b/src/main/python/camelot/backends/poppler_backend.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+
+import shutil
+import subprocess
+
+
+class PopplerBackend(object):
+ def convert(self, pdf_path, png_path):
+ pdftopng_executable = shutil.which("pdftopng")
+ if pdftopng_executable is None:
+ raise OSError(
+ "pdftopng is not installed. You can install it using the 'pip install pdftopng' command."
+ )
+
+ pdftopng_command = [pdftopng_executable, pdf_path, png_path]
+
+ try:
+ subprocess.check_output(
+ " ".join(pdftopng_command), stderr=subprocess.STDOUT, shell=True
+ )
+ except subprocess.CalledProcessError as e:
+ raise ValueError(e.output)
diff --git a/src/main/python/camelot/cli.py b/src/main/python/camelot/cli.py
new file mode 100644
index 00000000..546a32d8
--- /dev/null
+++ b/src/main/python/camelot/cli.py
@@ -0,0 +1,304 @@
+# -*- coding: utf-8 -*-
+
+import logging
+
+import click
+
+try:
+ import matplotlib.pyplot as plt
+except ImportError:
+ _HAS_MPL = False
+else:
+ _HAS_MPL = True
+
+from . import __version__, read_pdf, plot
+
+
+logger = logging.getLogger("camelot")
+logger.setLevel(logging.INFO)
+
+
+class Config(object):
+ def __init__(self):
+ self.config = {}
+
+ def set_config(self, key, value):
+ self.config[key] = value
+
+
+pass_config = click.make_pass_decorator(Config)
+
+
+@click.group(name="camelot")
+@click.version_option(version=__version__)
+@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
+@click.option(
+ "-p",
+ "--pages",
+ default="1",
+ help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.",
+)
+@click.option("-pw", "--password", help="Password for decryption.")
+@click.option("-o", "--output", help="Output file path.")
+@click.option(
+ "-f",
+ "--format",
+ type=click.Choice(["csv", "excel", "html", "json", "markdown", "sqlite"]),
+ help="Output file format.",
+)
+@click.option("-z", "--zip", is_flag=True, help="Create ZIP archive.")
+@click.option(
+ "-split",
+ "--split_text",
+ is_flag=True,
+ help="Split text that spans across multiple cells.",
+)
+@click.option(
+ "-flag",
+ "--flag_size",
+ is_flag=True,
+ help="Flag text based on" " font size. Useful to detect super/subscripts.",
+)
+@click.option(
+ "-strip",
+ "--strip_text",
+ help="Characters that should be stripped from a string before"
+ " assigning it to a cell.",
+)
+@click.option(
+ "-M",
+ "--margins",
+ nargs=3,
+ default=(1.0, 0.5, 0.1),
+ help="PDFMiner char_margin, line_margin and word_margin.",
+)
+@click.pass_context
+def cli(ctx, *args, **kwargs):
+ """Camelot: PDF Table Extraction for Humans"""
+ ctx.obj = Config()
+ for key, value in kwargs.items():
+ ctx.obj.set_config(key, value)
+
+
+@cli.command("lattice")
+@click.option(
+ "-R",
+ "--table_regions",
+ default=[],
+ multiple=True,
+ help="Page regions to analyze. Example: x1,y1,x2,y2"
+ " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+ "-T",
+ "--table_areas",
+ default=[],
+ multiple=True,
+ help="Table areas to process. Example: x1,y1,x2,y2"
+ " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+ "-back", "--process_background", is_flag=True, help="Process background lines."
+)
+@click.option(
+ "-scale",
+ "--line_scale",
+ default=15,
+ help="Line size scaling factor. The larger the value,"
+ " the smaller the detected lines.",
+)
+@click.option(
+ "-copy",
+ "--copy_text",
+ default=[],
+ type=click.Choice(["h", "v"]),
+ multiple=True,
+ help="Direction in which text in a spanning cell" " will be copied over.",
+)
+@click.option(
+ "-shift",
+ "--shift_text",
+ default=["l", "t"],
+ type=click.Choice(["", "l", "r", "t", "b"]),
+ multiple=True,
+ help="Direction in which text in a spanning cell will flow.",
+)
+@click.option(
+ "-l",
+ "--line_tol",
+ default=2,
+ help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
+)
+@click.option(
+ "-j",
+ "--joint_tol",
+ default=2,
+ help="Tolerance parameter used to decide whether"
+ " the detected lines and points lie close to each other.",
+)
+@click.option(
+ "-block",
+ "--threshold_blocksize",
+ default=15,
+ help="For adaptive thresholding, size of a pixel"
+ " neighborhood that is used to calculate a threshold value for"
+ " the pixel. Example: 3, 5, 7, and so on.",
+)
+@click.option(
+ "-const",
+ "--threshold_constant",
+ default=-2,
+ help="For adaptive thresholding, constant subtracted"
+ " from the mean or weighted mean. Normally, it is positive but"
+ " may be zero or negative as well.",
+)
+@click.option(
+ "-I",
+ "--iterations",
+ default=0,
+ help="Number of times for erosion/dilation will be applied.",
+)
+@click.option(
+ "-res",
+ "--resolution",
+ default=300,
+ help="Resolution used for PDF to PNG conversion.",
+)
+@click.option(
+ "-plot",
+ "--plot_type",
+ type=click.Choice(["text", "grid", "contour", "joint", "line"]),
+ help="Plot elements found on PDF page for visual debugging.",
+)
+@click.argument("filepath", type=click.Path(exists=True))
+@pass_config
+def lattice(c, *args, **kwargs):
+ """Use lines between text to parse the table."""
+ conf = c.config
+ pages = conf.pop("pages")
+ output = conf.pop("output")
+ f = conf.pop("format")
+ compress = conf.pop("zip")
+ quiet = conf.pop("quiet")
+ plot_type = kwargs.pop("plot_type")
+ filepath = kwargs.pop("filepath")
+ kwargs.update(conf)
+
+ table_regions = list(kwargs["table_regions"])
+ kwargs["table_regions"] = None if not table_regions else table_regions
+ table_areas = list(kwargs["table_areas"])
+ kwargs["table_areas"] = None if not table_areas else table_areas
+ copy_text = list(kwargs["copy_text"])
+ kwargs["copy_text"] = None if not copy_text else copy_text
+ kwargs["shift_text"] = list(kwargs["shift_text"])
+
+ if plot_type is not None:
+ if not _HAS_MPL:
+ raise ImportError("matplotlib is required for plotting.")
+ else:
+ if output is None:
+ raise click.UsageError("Please specify output file path using --output")
+ if f is None:
+ raise click.UsageError("Please specify output file format using --format")
+
+ tables = read_pdf(
+ filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
+ )
+ click.echo(f"Found {tables.n} tables")
+ if plot_type is not None:
+ for table in tables:
+ plot(table, kind=plot_type)
+ plt.show()
+ else:
+ tables.export(output, f=f, compress=compress)
+
+
+@cli.command("stream")
+@click.option(
+ "-R",
+ "--table_regions",
+ default=[],
+ multiple=True,
+ help="Page regions to analyze. Example: x1,y1,x2,y2"
+ " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+ "-T",
+ "--table_areas",
+ default=[],
+ multiple=True,
+ help="Table areas to process. Example: x1,y1,x2,y2"
+ " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+ "-C",
+ "--columns",
+ default=[],
+ multiple=True,
+ help="X coordinates of column separators.",
+)
+@click.option(
+ "-e",
+ "--edge_tol",
+ default=50,
+ help="Tolerance parameter" " for extending textedges vertically.",
+)
+@click.option(
+ "-r",
+ "--row_tol",
+ default=2,
+ help="Tolerance parameter" " used to combine text vertically, to generate rows.",
+)
+@click.option(
+ "-c",
+ "--column_tol",
+ default=0,
+ help="Tolerance parameter"
+ " used to combine text horizontally, to generate columns.",
+)
+@click.option(
+ "-plot",
+ "--plot_type",
+ type=click.Choice(["text", "grid", "contour", "textedge"]),
+ help="Plot elements found on PDF page for visual debugging.",
+)
+@click.argument("filepath", type=click.Path(exists=True))
+@pass_config
+def stream(c, *args, **kwargs):
+ """Use spaces between text to parse the table."""
+ conf = c.config
+ pages = conf.pop("pages")
+ output = conf.pop("output")
+ f = conf.pop("format")
+ compress = conf.pop("zip")
+ quiet = conf.pop("quiet")
+ plot_type = kwargs.pop("plot_type")
+ filepath = kwargs.pop("filepath")
+ kwargs.update(conf)
+
+ table_regions = list(kwargs["table_regions"])
+ kwargs["table_regions"] = None if not table_regions else table_regions
+ table_areas = list(kwargs["table_areas"])
+ kwargs["table_areas"] = None if not table_areas else table_areas
+ columns = list(kwargs["columns"])
+ kwargs["columns"] = None if not columns else columns
+
+ if plot_type is not None:
+ if not _HAS_MPL:
+ raise ImportError("matplotlib is required for plotting.")
+ else:
+ if output is None:
+ raise click.UsageError("Please specify output file path using --output")
+ if f is None:
+ raise click.UsageError("Please specify output file format using --format")
+
+ tables = read_pdf(
+ filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
+ )
+ click.echo(f"Found {tables.n} tables")
+ if plot_type is not None:
+ for table in tables:
+ plot(table, kind=plot_type)
+ plt.show()
+ else:
+ tables.export(output, f=f, compress=compress)
diff --git a/src/main/python/camelot/core.py b/src/main/python/camelot/core.py
new file mode 100644
index 00000000..58a98efd
--- /dev/null
+++ b/src/main/python/camelot/core.py
@@ -0,0 +1,764 @@
+# -*- coding: utf-8 -*-
+
+import os
+import sqlite3
+import zipfile
+import tempfile
+from itertools import chain
+from operator import itemgetter
+
+import numpy as np
+import pandas as pd
+
+
+# minimum number of vertical textline intersections for a textedge
+# to be considered valid
+TEXTEDGE_REQUIRED_ELEMENTS = 4
+# padding added to table area on the left, right and bottom
+TABLE_AREA_PADDING = 10
+
+
+class TextEdge(object):
+ """Defines a text edge coordinates relative to a left-bottom
+ origin. (PDF coordinate space)
+
+ Parameters
+ ----------
+ x : float
+ x-coordinate of the text edge.
+ y0 : float
+ y-coordinate of bottommost point.
+ y1 : float
+ y-coordinate of topmost point.
+ align : string, optional (default: 'left')
+ {'left', 'right', 'middle'}
+
+ Attributes
+ ----------
+ intersections: int
+ Number of intersections with horizontal text rows.
+ is_valid: bool
+ A text edge is valid if it intersects with at least
+ TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
+
+ """
+
+ def __init__(self, x, y0, y1, align="left"):
+ self.x = x
+ self.y0 = y0
+ self.y1 = y1
+ self.align = align
+ self.intersections = 0
+ self.is_valid = False
+
+ def __repr__(self):
+ x = round(self.x, 2)
+ y0 = round(self.y0, 2)
+ y1 = round(self.y1, 2)
+ return (
+ f""
+ )
+
+ def update_coords(self, x, y0, edge_tol=50):
+ """Updates the text edge's x and bottom y coordinates and sets
+ the is_valid attribute.
+ """
+ if np.isclose(self.y0, y0, atol=edge_tol):
+ self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
+ self.y0 = y0
+ self.intersections += 1
+ # a textedge is valid only if it extends uninterrupted
+ # over a required number of textlines
+ if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
+ self.is_valid = True
+
+
+class TextEdges(object):
+ """Defines a dict of left, right and middle text edges found on
+ the PDF page. The dict has three keys based on the alignments,
+ and each key's value is a list of camelot.core.TextEdge objects.
+ """
+
+ def __init__(self, edge_tol=50):
+ self.edge_tol = edge_tol
+ self._textedges = {"left": [], "right": [], "middle": []}
+
+ @staticmethod
+ def get_x_coord(textline, align):
+ """Returns the x coordinate of a text row based on the
+ specified alignment.
+ """
+ x_left = textline.x0
+ x_right = textline.x1
+ x_middle = x_left + (x_right - x_left) / 2.0
+ x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
+ return x_coord[align]
+
+ def find(self, x_coord, align):
+ """Returns the index of an existing text edge using
+ the specified x coordinate and alignment.
+ """
+ for i, te in enumerate(self._textedges[align]):
+ if np.isclose(te.x, x_coord, atol=0.5):
+ return i
+ return None
+
+ def add(self, textline, align):
+ """Adds a new text edge to the current dict."""
+ x = self.get_x_coord(textline, align)
+ y0 = textline.y0
+ y1 = textline.y1
+ te = TextEdge(x, y0, y1, align=align)
+ self._textedges[align].append(te)
+
+ def update(self, textline):
+ """Updates an existing text edge in the current dict."""
+ for align in ["left", "right", "middle"]:
+ x_coord = self.get_x_coord(textline, align)
+ idx = self.find(x_coord, align)
+ if idx is None:
+ self.add(textline, align)
+ else:
+ self._textedges[align][idx].update_coords(
+ x_coord, textline.y0, edge_tol=self.edge_tol
+ )
+
+ def generate(self, textlines):
+ """Generates the text edges dict based on horizontal text
+ rows.
+ """
+ for tl in textlines:
+ if len(tl.get_text().strip()) > 1: # TODO: hacky
+ self.update(tl)
+
+ def get_relevant(self):
+ """Returns the list of relevant text edges (all share the same
+ alignment) based on which list intersects horizontal text rows
+ the most.
+ """
+ intersections_sum = {
+ "left": sum(
+ te.intersections for te in self._textedges["left"] if te.is_valid
+ ),
+ "right": sum(
+ te.intersections for te in self._textedges["right"] if te.is_valid
+ ),
+ "middle": sum(
+ te.intersections for te in self._textedges["middle"] if te.is_valid
+ ),
+ }
+
+ # TODO: naive
+ # get vertical textedges that intersect maximum number of
+ # times with horizontal textlines
+ relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
+ return self._textedges[relevant_align]
+
+ def get_table_areas(self, textlines, relevant_textedges):
+ """Returns a dict of interesting table areas on the PDF page
+ calculated using relevant text edges.
+ """
+
+ def pad(area, average_row_height):
+ x0 = area[0] - TABLE_AREA_PADDING
+ y0 = area[1] - TABLE_AREA_PADDING
+ x1 = area[2] + TABLE_AREA_PADDING
+ # add a constant since table headers can be relatively up
+ y1 = area[3] + average_row_height * 5
+ return (x0, y0, x1, y1)
+
+ # sort relevant textedges in reading order
+ relevant_textedges.sort(key=lambda te: (-te.y0, te.x))
+
+ table_areas = {}
+ for te in relevant_textedges:
+ if te.is_valid:
+ if not table_areas:
+ table_areas[(te.x, te.y0, te.x, te.y1)] = None
+ else:
+ found = None
+ for area in table_areas:
+ # check for overlap
+ if te.y1 >= area[1] and te.y0 <= area[3]:
+ found = area
+ break
+ if found is None:
+ table_areas[(te.x, te.y0, te.x, te.y1)] = None
+ else:
+ table_areas.pop(found)
+ updated_area = (
+ found[0],
+ min(te.y0, found[1]),
+ max(found[2], te.x),
+ max(found[3], te.y1),
+ )
+ table_areas[updated_area] = None
+
+ # extend table areas based on textlines that overlap
+ # vertically. it's possible that these textlines were
+ # eliminated during textedges generation since numbers and
+ # chars/words/sentences are often aligned differently.
+ # drawback: table areas that have paragraphs on their sides
+ # will include the paragraphs too.
+ sum_textline_height = 0
+ for tl in textlines:
+ sum_textline_height += tl.y1 - tl.y0
+ found = None
+ for area in table_areas:
+ # check for overlap
+ if tl.y0 >= area[1] and tl.y1 <= area[3]:
+ found = area
+ break
+ if found is not None:
+ table_areas.pop(found)
+ updated_area = (
+ min(tl.x0, found[0]),
+ min(tl.y0, found[1]),
+ max(found[2], tl.x1),
+ max(found[3], tl.y1),
+ )
+ table_areas[updated_area] = None
+ average_textline_height = sum_textline_height / float(len(textlines))
+
+ # add some padding to table areas
+ table_areas_padded = {}
+ for area in table_areas:
+ table_areas_padded[pad(area, average_textline_height)] = None
+
+ return table_areas_padded
+
+
+class Cell(object):
+ """Defines a cell in a table with coordinates relative to a
+ left-bottom origin. (PDF coordinate space)
+
+ Parameters
+ ----------
+ x1 : float
+ x-coordinate of left-bottom point.
+ y1 : float
+ y-coordinate of left-bottom point.
+ x2 : float
+ x-coordinate of right-top point.
+ y2 : float
+ y-coordinate of right-top point.
+
+ Attributes
+ ----------
+ lb : tuple
+ Tuple representing left-bottom coordinates.
+ lt : tuple
+ Tuple representing left-top coordinates.
+ rb : tuple
+ Tuple representing right-bottom coordinates.
+ rt : tuple
+ Tuple representing right-top coordinates.
+ left : bool
+ Whether or not cell is bounded on the left.
+ right : bool
+ Whether or not cell is bounded on the right.
+ top : bool
+ Whether or not cell is bounded on the top.
+ bottom : bool
+ Whether or not cell is bounded on the bottom.
+ hspan : bool
+ Whether or not cell spans horizontally.
+ vspan : bool
+ Whether or not cell spans vertically.
+ text : string
+ Text assigned to cell.
+
+ """
+
+ def __init__(self, x1, y1, x2, y2):
+ self.x1 = x1
+ self.y1 = y1
+ self.x2 = x2
+ self.y2 = y2
+ self.lb = (x1, y1)
+ self.lt = (x1, y2)
+ self.rb = (x2, y1)
+ self.rt = (x2, y2)
+ self.left = False
+ self.right = False
+ self.top = False
+ self.bottom = False
+ self.hspan = False
+ self.vspan = False
+ self._text = ""
+
+ def __repr__(self):
+ x1 = round(self.x1)
+ y1 = round(self.y1)
+ x2 = round(self.x2)
+ y2 = round(self.y2)
+ return f""
+
+ @property
+ def text(self):
+ return self._text
+
+ @text.setter
+ def text(self, t):
+ self._text = "".join([self._text, t])
+
+ @property
+ def bound(self):
+ """The number of sides on which the cell is bounded."""
+ return self.top + self.bottom + self.left + self.right
+
+
+class Table(object):
+ """Defines a table with coordinates relative to a left-bottom
+ origin. (PDF coordinate space)
+
+ Parameters
+ ----------
+ cols : list
+ List of tuples representing column x-coordinates in increasing
+ order.
+ rows : list
+ List of tuples representing row y-coordinates in decreasing
+ order.
+
+ Attributes
+ ----------
+ df : :class:`pandas.DataFrame`
+ shape : tuple
+ Shape of the table.
+ accuracy : float
+ Accuracy with which text was assigned to the cell.
+ whitespace : float
+ Percentage of whitespace in the table.
+ order : int
+ Table number on PDF page.
+ page : int
+ PDF page number.
+
+ """
+
+ def __init__(self, cols, rows):
+ self.cols = cols
+ self.rows = rows
+ self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
+ self.df = None
+ self.shape = (0, 0)
+ self.accuracy = 0
+ self.whitespace = 0
+ self.order = None
+ self.page = None
+
+ def __repr__(self):
+ return f"<{self.__class__.__name__} shape={self.shape}>"
+
+ def __lt__(self, other):
+ if self.page == other.page:
+ if self.order < other.order:
+ return True
+ if self.page < other.page:
+ return True
+
+ @property
+ def data(self):
+ """Returns two-dimensional list of strings in table."""
+ d = []
+ for row in self.cells:
+ d.append([cell.text.strip() for cell in row])
+ return d
+
+ @property
+ def parsing_report(self):
+ """Returns a parsing report with %accuracy, %whitespace,
+ table number on page and page number.
+ """
+ # pretty?
+ report = {
+ "accuracy": round(self.accuracy, 2),
+ "whitespace": round(self.whitespace, 2),
+ "order": self.order,
+ "page": self.page,
+ }
+ return report
+
+ def set_all_edges(self):
+ """Sets all table edges to True."""
+ for row in self.cells:
+ for cell in row:
+ cell.left = cell.right = cell.top = cell.bottom = True
+ return self
+
+ def set_edges(self, vertical, horizontal, joint_tol=2):
+ """Sets a cell's edges to True depending on whether the cell's
+ coordinates overlap with the line's coordinates within a
+ tolerance.
+
+ Parameters
+ ----------
+ vertical : list
+ List of detected vertical lines.
+ horizontal : list
+ List of detected horizontal lines.
+
+ """
+ for v in vertical:
+ # find closest x coord
+ # iterate over y coords and find closest start and end points
+ i = [
+ i
+ for i, t in enumerate(self.cols)
+ if np.isclose(v[0], t[0], atol=joint_tol)
+ ]
+ j = [
+ j
+ for j, t in enumerate(self.rows)
+ if np.isclose(v[3], t[0], atol=joint_tol)
+ ]
+ k = [
+ k
+ for k, t in enumerate(self.rows)
+ if np.isclose(v[1], t[0], atol=joint_tol)
+ ]
+ if not j:
+ continue
+ J = j[0]
+ if i == [0]: # only left edge
+ L = i[0]
+ if k:
+ K = k[0]
+ while J < K:
+ self.cells[J][L].left = True
+ J += 1
+ else:
+ K = len(self.rows)
+ while J < K:
+ self.cells[J][L].left = True
+ J += 1
+ elif i == []: # only right edge
+ L = len(self.cols) - 1
+ if k:
+ K = k[0]
+ while J < K:
+ self.cells[J][L].right = True
+ J += 1
+ else:
+ K = len(self.rows)
+ while J < K:
+ self.cells[J][L].right = True
+ J += 1
+ else: # both left and right edges
+ L = i[0]
+ if k:
+ K = k[0]
+ while J < K:
+ self.cells[J][L].left = True
+ self.cells[J][L - 1].right = True
+ J += 1
+ else:
+ K = len(self.rows)
+ while J < K:
+ self.cells[J][L].left = True
+ self.cells[J][L - 1].right = True
+ J += 1
+
+ for h in horizontal:
+ # find closest y coord
+ # iterate over x coords and find closest start and end points
+ i = [
+ i
+ for i, t in enumerate(self.rows)
+ if np.isclose(h[1], t[0], atol=joint_tol)
+ ]
+ j = [
+ j
+ for j, t in enumerate(self.cols)
+ if np.isclose(h[0], t[0], atol=joint_tol)
+ ]
+ k = [
+ k
+ for k, t in enumerate(self.cols)
+ if np.isclose(h[2], t[0], atol=joint_tol)
+ ]
+ if not j:
+ continue
+ J = j[0]
+ if i == [0]: # only top edge
+ L = i[0]
+ if k:
+ K = k[0]
+ while J < K:
+ self.cells[L][J].top = True
+ J += 1
+ else:
+ K = len(self.cols)
+ while J < K:
+ self.cells[L][J].top = True
+ J += 1
+ elif i == []: # only bottom edge
+ L = len(self.rows) - 1
+ if k:
+ K = k[0]
+ while J < K:
+ self.cells[L][J].bottom = True
+ J += 1
+ else:
+ K = len(self.cols)
+ while J < K:
+ self.cells[L][J].bottom = True
+ J += 1
+ else: # both top and bottom edges
+ L = i[0]
+ if k:
+ K = k[0]
+ while J < K:
+ self.cells[L][J].top = True
+ self.cells[L - 1][J].bottom = True
+ J += 1
+ else:
+ K = len(self.cols)
+ while J < K:
+ self.cells[L][J].top = True
+ self.cells[L - 1][J].bottom = True
+ J += 1
+
+ return self
+
+ def set_border(self):
+ """Sets table border edges to True."""
+ for r in range(len(self.rows)):
+ self.cells[r][0].left = True
+ self.cells[r][len(self.cols) - 1].right = True
+ for c in range(len(self.cols)):
+ self.cells[0][c].top = True
+ self.cells[len(self.rows) - 1][c].bottom = True
+ return self
+
+ def set_span(self):
+ """Sets a cell's hspan or vspan attribute to True depending
+ on whether the cell spans horizontally or vertically.
+ """
+ for row in self.cells:
+ for cell in row:
+ left = cell.left
+ right = cell.right
+ top = cell.top
+ bottom = cell.bottom
+ if cell.bound == 4:
+ continue
+ elif cell.bound == 3:
+ if not left and (right and top and bottom):
+ cell.hspan = True
+ elif not right and (left and top and bottom):
+ cell.hspan = True
+ elif not top and (left and right and bottom):
+ cell.vspan = True
+ elif not bottom and (left and right and top):
+ cell.vspan = True
+ elif cell.bound == 2:
+ if left and right and (not top and not bottom):
+ cell.vspan = True
+ elif top and bottom and (not left and not right):
+ cell.hspan = True
+ elif cell.bound in [0, 1]:
+ cell.vspan = True
+ cell.hspan = True
+ return self
+
+ def to_csv(self, path, **kwargs):
+ """Writes Table to a comma-separated values (csv) file.
+
+ For kwargs, check :meth:`pandas.DataFrame.to_csv`.
+
+ Parameters
+ ----------
+ path : str
+ Output filepath.
+
+ """
+ kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1}
+ kw.update(kwargs)
+ self.df.to_csv(path, **kw)
+
+ def to_json(self, path, **kwargs):
+ """Writes Table to a JSON file.
+
+ For kwargs, check :meth:`pandas.DataFrame.to_json`.
+
+ Parameters
+ ----------
+ path : str
+ Output filepath.
+
+ """
+ kw = {"orient": "records"}
+ kw.update(kwargs)
+ json_string = self.df.to_json(**kw)
+ with open(path, "w") as f:
+ f.write(json_string)
+
+ def to_excel(self, path, **kwargs):
+ """Writes Table to an Excel file.
+
+ For kwargs, check :meth:`pandas.DataFrame.to_excel`.
+
+ Parameters
+ ----------
+ path : str
+ Output filepath.
+
+ """
+ kw = {
+ "sheet_name": f"page-{self.page}-table-{self.order}",
+ "encoding": "utf-8",
+ }
+ kw.update(kwargs)
+ writer = pd.ExcelWriter(path)
+ self.df.to_excel(writer, **kw)
+ writer.save()
+
+ def to_html(self, path, **kwargs):
+ """Writes Table to an HTML file.
+
+ For kwargs, check :meth:`pandas.DataFrame.to_html`.
+
+ Parameters
+ ----------
+ path : str
+ Output filepath.
+
+ """
+ html_string = self.df.to_html(**kwargs)
+ with open(path, "w", encoding="utf-8") as f:
+ f.write(html_string)
+
+ def to_markdown(self, path, **kwargs):
+ """Writes Table to a Markdown file.
+
+ For kwargs, check :meth:`pandas.DataFrame.to_markdown`.
+
+ Parameters
+ ----------
+ path : str
+ Output filepath.
+
+ """
+ md_string = self.df.to_markdown(**kwargs)
+ with open(path, "w", encoding="utf-8") as f:
+ f.write(md_string)
+
+ def to_sqlite(self, path, **kwargs):
+ """Writes Table to sqlite database.
+
+ For kwargs, check :meth:`pandas.DataFrame.to_sql`.
+
+ Parameters
+ ----------
+ path : str
+ Output filepath.
+
+ """
+ kw = {"if_exists": "replace", "index": False}
+ kw.update(kwargs)
+ conn = sqlite3.connect(path)
+ table_name = f"page-{self.page}-table-{self.order}"
+ self.df.to_sql(table_name, conn, **kw)
+ conn.commit()
+ conn.close()
+
+
+class TableList(object):
+ """Defines a list of camelot.core.Table objects. Each table can
+ be accessed using its index.
+
+ Attributes
+ ----------
+ n : int
+ Number of tables in the list.
+
+ """
+
+ def __init__(self, tables):
+ self._tables = tables
+
+ def __repr__(self):
+ return f"<{self.__class__.__name__} n={self.n}>"
+
+ def __len__(self):
+ return len(self._tables)
+
+ def __getitem__(self, idx):
+ return self._tables[idx]
+
+ @staticmethod
+ def _format_func(table, f):
+ return getattr(table, f"to_{f}")
+
+ @property
+ def n(self):
+ return len(self)
+
+ def _write_file(self, f=None, **kwargs):
+ dirname = kwargs.get("dirname")
+ root = kwargs.get("root")
+ ext = kwargs.get("ext")
+ for table in self._tables:
+ filename = f"{root}-page-{table.page}-table-{table.order}{ext}"
+ filepath = os.path.join(dirname, filename)
+ to_format = self._format_func(table, f)
+ to_format(filepath)
+
+ def _compress_dir(self, **kwargs):
+ path = kwargs.get("path")
+ dirname = kwargs.get("dirname")
+ root = kwargs.get("root")
+ ext = kwargs.get("ext")
+ zipname = os.path.join(os.path.dirname(path), root) + ".zip"
+ with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
+ for table in self._tables:
+ filename = f"{root}-page-{table.page}-table-{table.order}{ext}"
+ filepath = os.path.join(dirname, filename)
+ z.write(filepath, os.path.basename(filepath))
+
+ def export(self, path, f="csv", compress=False):
+ """Exports the list of tables to specified file format.
+
+ Parameters
+ ----------
+ path : str
+ Output filepath.
+ f : str
+ File format. Can be csv, excel, html, json, markdown or sqlite.
+ compress : bool
+ Whether or not to add files to a ZIP archive.
+
+ """
+ dirname = os.path.dirname(path)
+ basename = os.path.basename(path)
+ root, ext = os.path.splitext(basename)
+ if compress:
+ dirname = tempfile.mkdtemp()
+
+ kwargs = {"path": path, "dirname": dirname, "root": root, "ext": ext}
+
+ if f in ["csv", "html", "json", "markdown"]:
+ self._write_file(f=f, **kwargs)
+ if compress:
+ self._compress_dir(**kwargs)
+ elif f == "excel":
+ filepath = os.path.join(dirname, basename)
+ writer = pd.ExcelWriter(filepath)
+ for table in self._tables:
+ sheet_name = f"page-{table.page}-table-{table.order}"
+ table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
+ writer.save()
+ if compress:
+ zipname = os.path.join(os.path.dirname(path), root) + ".zip"
+ with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
+ z.write(filepath, os.path.basename(filepath))
+ elif f == "sqlite":
+ filepath = os.path.join(dirname, basename)
+ for table in self._tables:
+ table.to_sqlite(filepath)
+ if compress:
+ zipname = os.path.join(os.path.dirname(path), root) + ".zip"
+ with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
+ z.write(filepath, os.path.basename(filepath))
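+
+
+# --- Usage sketch (illustrative only, not part of the library API) ---------
+# A minimal example of how a TableList returned by camelot.read_pdf might be
+# written to disk; the output directory name is a placeholder.
+def _export_tables_example(tables, out_dir="tables-out"):
+    os.makedirs(out_dir, exist_ok=True)
+    # one CSV per table, bundled into tables.zip next to the given path
+    tables.export(os.path.join(out_dir, "tables.csv"), f="csv", compress=True)
+    # individual tables expose to_csv/to_json/to_excel/to_html/to_markdown/to_sqlite
+    if tables.n:
+        tables[0].to_markdown(os.path.join(out_dir, "first-table.md"))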
diff --git a/src/main/python/camelot/handlers.py b/src/main/python/camelot/handlers.py
new file mode 100644
index 00000000..3feadb60
--- /dev/null
+++ b/src/main/python/camelot/handlers.py
@@ -0,0 +1,180 @@
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+
+from pypdf import PdfReader, PdfWriter
+
+from .core import TableList
+from .parsers import Stream, Lattice
+from .utils import (
+ TemporaryDirectory,
+ get_page_layout,
+ get_text_objects,
+ get_rotation,
+ is_url,
+ download_url,
+)
+
+
+class PDFHandler(object):
+ """Handles all operations like temp directory creation, splitting
+ file into single page PDFs, parsing each PDF and then removing the
+ temp directory.
+
+ Parameters
+ ----------
+ filepath : str
+ Filepath or URL of the PDF file.
+ pages : str, optional (default: '1')
+ Comma-separated page numbers.
+ Example: '1,3,4' or '1,4-end' or 'all'.
+ password : str, optional (default: None)
+ Password for decryption.
+
+ """
+
+ def __init__(self, filepath, pages="1", password=None):
+ if is_url(filepath):
+ filepath = download_url(filepath)
+ self.filepath = filepath
+ #if not filepath.lower().endswith(".pdf"):
+ # raise NotImplementedError("File format not supported")
+
+ if password is None:
+ self.password = ""
+ else:
+ self.password = password
+ if sys.version_info[0] < 3:
+ self.password = self.password.encode("ascii")
+ self.pages = self._get_pages(pages)
+
+ def _get_pages(self, pages):
+ """Converts pages string to list of ints.
+
+ Parameters
+ ----------
+ pages : str, optional (default: '1')
+ Comma-separated page numbers.
+ Example: '1,3,4' or '1,4-end' or 'all'.
+
+ Returns
+ -------
+ P : list
+ List of int page numbers.
+
+ """
+ page_numbers = []
+
+ if pages == "1":
+ page_numbers.append({"start": 1, "end": 1})
+ else:
+ with open(self.filepath, "rb") as f:
+ infile = PdfReader(f, strict=False)
+
+ if infile.is_encrypted:
+ infile.decrypt(self.password)
+
+ if pages == "all":
+ page_numbers.append({"start": 1, "end": len(infile.pages)})
+ else:
+ for r in pages.split(","):
+ if "-" in r:
+ a, b = r.split("-")
+ if b == "end":
+ b = len(infile.pages)
+ page_numbers.append({"start": int(a), "end": int(b)})
+ else:
+ page_numbers.append({"start": int(r), "end": int(r)})
+
+ P = []
+ for p in page_numbers:
+ P.extend(range(p["start"], p["end"] + 1))
+ return sorted(set(P))
+
+ def _save_page(self, filepath, page, temp):
+ """Saves specified page from PDF into a temporary directory.
+
+ Parameters
+ ----------
+ filepath : str
+ Filepath or URL of the PDF file.
+ page : int
+ Page number.
+ temp : str
+ Tmp directory.
+
+ """
+ with open(filepath, "rb") as fileobj:
+ infile = PdfReader(fileobj, strict=False)
+ if infile.is_encrypted:
+ infile.decrypt(self.password)
+ fpath = os.path.join(temp, f"page-{page}.pdf")
+ froot, fext = os.path.splitext(fpath)
+ p = infile.pages[page - 1]
+ outfile = PdfWriter()
+ outfile.add_page(p)
+ with open(fpath, "wb") as f:
+ outfile.write(f)
+ layout, dim = get_page_layout(fpath)
+ # fix rotated PDF
+ chars = get_text_objects(layout, ltype="char")
+ horizontal_text = get_text_objects(layout, ltype="horizontal_text")
+ vertical_text = get_text_objects(layout, ltype="vertical_text")
+ rotation = get_rotation(chars, horizontal_text, vertical_text)
+ if rotation != "":
+ fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
+ os.rename(fpath, fpath_new)
+ instream = open(fpath_new, "rb")
+ infile = PdfReader(instream, strict=False)
+ if infile.is_encrypted:
+ infile.decrypt(self.password)
+ outfile = PdfWriter()
+ p = infile.pages[0]
+ if rotation == "anticlockwise":
+ p.rotate(90)
+ elif rotation == "clockwise":
+ p.rotate(-90)
+ outfile.add_page(p)
+ with open(fpath, "wb") as f:
+ outfile.write(f)
+ instream.close()
+
+ def parse(
+ self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
+ ):
+        """Extracts tables by calling parser.extract_tables on all single
+        page PDFs.
+
+ Parameters
+ ----------
+ flavor : str (default: 'lattice')
+ The parsing method to use ('lattice' or 'stream').
+ Lattice is used by default.
+        suppress_stdout : bool (default: False)
+ Suppress logs and warnings.
+ layout_kwargs : dict, optional (default: {})
+            A dict of pdfminer.layout.LAParams kwargs.
+ kwargs : dict
+ See camelot.read_pdf kwargs.
+
+ Returns
+ -------
+ tables : camelot.core.TableList
+ List of tables found in PDF.
+
+ """
+ tables = []
+ with TemporaryDirectory() as tempdir:
+ for p in self.pages:
+ self._save_page(self.filepath, p, tempdir)
+ pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
+ parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
+ for p in pages:
+ t = parser.extract_tables(
+ p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
+ )
+ tables.extend(t)
+ return TableList(sorted(tables))
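+
+
+# --- Usage sketch (illustrative only) ---------------------------------------
+# How this handler is typically driven (camelot.read_pdf wraps exactly this
+# flow); "example.pdf" is a placeholder path.
+def _handler_example(filepath="example.pdf"):
+    handler = PDFHandler(filepath, pages="1-2")
+    # parse the selected pages with the default Lattice parser
+    return handler.parse(flavor="lattice", suppress_stdout=True)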
diff --git a/src/main/python/camelot/image_processing.py b/src/main/python/camelot/image_processing.py
new file mode 100644
index 00000000..08aae1b5
--- /dev/null
+++ b/src/main/python/camelot/image_processing.py
@@ -0,0 +1,399 @@
+# -*- coding: utf-8 -*-
+
+import cv2
+import numpy as np
+
+
+def adaptive_threshold_with_img(img, process_background=False, blocksize=15, c=-2):
+    """Thresholds an already loaded image using OpenCV's adaptiveThreshold.
+
+    In-memory variant of :func:`adaptive_threshold`; it accepts a
+    numpy.ndarray instead of a file path and shares the same defaults.
+    """
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+ if process_background:
+ threshold = cv2.adaptiveThreshold(
+ gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
+ )
+ else:
+ threshold = cv2.adaptiveThreshold(
+ np.invert(gray),
+ 255,
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+ cv2.THRESH_BINARY,
+ blocksize,
+ c,
+ )
+ return img, threshold
+
+
+def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
+ """Thresholds an image using OpenCV's adaptiveThreshold.
+
+ Parameters
+ ----------
+ imagename : string
+ Path to image file.
+ process_background : bool, optional (default: False)
+ Whether or not to process lines that are in background.
+ blocksize : int, optional (default: 15)
+ Size of a pixel neighborhood that is used to calculate a
+ threshold value for the pixel: 3, 5, 7, and so on.
+
+        For more information, refer to OpenCV's adaptiveThreshold documentation.
+ c : int, optional (default: -2)
+ Constant subtracted from the mean or weighted mean.
+ Normally, it is positive but may be zero or negative as well.
+
+        For more information, refer to OpenCV's adaptiveThreshold documentation.
+
+ Returns
+ -------
+ img : object
+ numpy.ndarray representing the original image.
+ threshold : object
+ numpy.ndarray representing the thresholded image.
+
+ """
+ img = cv2.imread(imagename)
+ img, threshold = adaptive_threshold_with_img(img, process_background, blocksize, c)
+ return img, threshold
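+
+
+# --- Usage sketch (illustrative only) ---------------------------------------
+# adaptive_threshold() reads the image from disk, while the new
+# adaptive_threshold_with_img() accepts an image that is already in memory
+# (e.g. after correct_lines below); the file name is a placeholder.
+def _threshold_example(imagename="page-1.png"):
+    img, threshold = adaptive_threshold(imagename, process_background=False)
+    img, threshold = adaptive_threshold_with_img(img, process_background=False)
+    return img, threshold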
+
+
+def find_lines(
+ threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
+):
+ """Finds horizontal and vertical lines by applying morphological
+ transformations on an image.
+
+ Parameters
+ ----------
+ threshold : object
+ numpy.ndarray representing the thresholded image.
+ regions : list, optional (default: None)
+ List of page regions that may contain tables of the form x1,y1,x2,y2
+ where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+ in image coordinate space.
+ direction : string, optional (default: 'horizontal')
+ Specifies whether to find vertical or horizontal lines.
+ line_scale : int, optional (default: 15)
+ Factor by which the page dimensions will be divided to get
+ smallest length of lines that should be detected.
+
+        The larger this value, the smaller the detected lines. Making it
+        too large will lead to text being detected as lines.
+    iterations : int, optional (default: 0)
+        Number of times erosion/dilation is applied.
+
+        For more information, refer to OpenCV's dilate documentation.
+
+ Returns
+ -------
+ dmask : object
+ numpy.ndarray representing pixels where vertical/horizontal
+ lines lie.
+ lines : list
+ List of tuples representing vertical/horizontal lines with
+ coordinates relative to a left-top origin in
+ image coordinate space.
+
+ """
+ lines = []
+
+ if direction == "vertical":
+ size = threshold.shape[0] // line_scale
+ el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
+ elif direction == "horizontal":
+ size = threshold.shape[1] // line_scale
+ el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
+    else:
+        raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
+
+ if regions is not None:
+ region_mask = np.zeros(threshold.shape)
+ for region in regions:
+ x, y, w, h = region
+ region_mask[y : y + h, x : x + w] = 1
+ threshold = np.multiply(threshold, region_mask)
+
+ threshold = cv2.erode(threshold, el)
+ threshold = cv2.dilate(threshold, el)
+ dmask = cv2.dilate(threshold, el, iterations=iterations)
+
+ try:
+ _, contours, _ = cv2.findContours(
+ threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+ )
+ except ValueError:
+ # for opencv backward compatibility
+ contours, _ = cv2.findContours(
+ threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+ )
+
+ for c in contours:
+ x, y, w, h = cv2.boundingRect(c)
+ x1, x2 = x, x + w
+ y1, y2 = y, y + h
+ if direction == "vertical":
+ lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1))
+ elif direction == "horizontal":
+ lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2))
+
+ return dmask, lines
+
+
+def find_contours(vertical, horizontal):
+ """Finds table boundaries using OpenCV's findContours.
+
+ Parameters
+ ----------
+ vertical : object
+ numpy.ndarray representing pixels where vertical lines lie.
+ horizontal : object
+ numpy.ndarray representing pixels where horizontal lines lie.
+
+ Returns
+ -------
+ cont : list
+ List of tuples representing table boundaries. Each tuple is of
+ the form (x, y, w, h) where (x, y) -> left-top, w -> width and
+ h -> height in image coordinate space.
+
+ """
+ mask = vertical + horizontal
+
+ try:
+ __, contours, __ = cv2.findContours(
+ mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+ )
+ except ValueError:
+ # for opencv backward compatibility
+ contours, __ = cv2.findContours(
+ mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+ )
+ # sort in reverse based on contour area and use first 10 contours
+ contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
+
+ cont = []
+ for c in contours:
+ c_poly = cv2.approxPolyDP(c, 3, True)
+ x, y, w, h = cv2.boundingRect(c_poly)
+ cont.append((x, y, w, h))
+ return cont
+
+
+def find_joints(contours, vertical, horizontal):
+ """Finds joints/intersections present inside each table boundary.
+
+ Parameters
+ ----------
+ contours : list
+ List of tuples representing table boundaries. Each tuple is of
+ the form (x, y, w, h) where (x, y) -> left-top, w -> width and
+ h -> height in image coordinate space.
+ vertical : object
+ numpy.ndarray representing pixels where vertical lines lie.
+ horizontal : object
+ numpy.ndarray representing pixels where horizontal lines lie.
+
+ Returns
+ -------
+ tables : dict
+ Dict with table boundaries as keys and list of intersections
+ in that boundary as their value.
+ Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
+ and (x2, y2) -> rt in image coordinate space.
+
+ """
+ joints = np.multiply(vertical, horizontal)
+ tables = {}
+ for c in contours:
+ x, y, w, h = c
+ roi = joints[y : y + h, x : x + w]
+ try:
+ __, jc, __ = cv2.findContours(
+ roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
+ )
+ except ValueError:
+ # for opencv backward compatibility
+ jc, __ = cv2.findContours(
+ roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
+ )
+ if len(jc) <= 4: # remove contours with less than 4 joints
+ continue
+ joint_coords = []
+ for j in jc:
+ jx, jy, jw, jh = cv2.boundingRect(j)
+ c1, c2 = x + (2 * jx + jw) // 2, y + (2 * jy + jh) // 2
+ joint_coords.append((c1, c2))
+ tables[(x, y + h, x + w, y)] = joint_coords
+
+ return tables
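+
+
+# --- Usage sketch (illustrative only) ---------------------------------------
+# How the helpers above are chained by the Lattice parser: threshold the page
+# image, detect line masks in both directions, then derive table boundaries
+# and the joints inside them; the file name is a placeholder.
+def _table_detection_example(imagename="page-1.png"):
+    img, threshold = adaptive_threshold(imagename)
+    vertical_mask, v_segments = find_lines(threshold, direction="vertical")
+    horizontal_mask, h_segments = find_lines(threshold, direction="horizontal")
+    contours = find_contours(vertical_mask, horizontal_mask)
+    # dict mapping each table bounding box to its joint coordinates
+    return find_joints(contours, vertical_mask, horizontal_mask)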
+
+
+def intersects(r1, r2):
+    """Checks whether two segments (ribs) intersect, within a small margin.
+
+    Parameters
+    ----------
+    r1 : tuple
+        (x11, y11, x21, y21) where (x11, y11) -> start coordinates
+        and (x21, y21) -> end coordinates of the first segment.
+    r2 : tuple
+        (x12, y12, x22, y22) where (x12, y12) -> start coordinates
+        and (x22, y22) -> end coordinates of the second segment.
+
+    Returns
+    -------
+    bool
+        True if the segments intersect, False otherwise.
+
+    """
+    c_m = 10  # allowed margin, in pixels
+ x11, y11, x21, y21 = r1[0], r1[1], r1[2], r1[3]
+ x12, y12, x22, y22 = r2[0], r2[1], r2[2], r2[3]
+
+ if (x11 == x21 and x12 == x22) or (y11 == y21 and y12 == y22):
+ return False
+ elif x11 == x21 and y12 == y22:
+ return x11 + c_m >= x12 and x11 <= x22 + c_m \
+ and y11 + c_m >= y12 >= y21 - c_m
+ else:
+ return x12 + c_m >= x11 and x12 <= x21 + c_m \
+ and y12 + c_m >= y11 >= y22 - c_m
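+
+
+# A worked example (illustrative only): a vertical and a horizontal segment
+# that cross are reported as intersecting; a 10-pixel margin is allowed at
+# the segment ends.
+def _intersects_example():
+    # vertical segments are stored as (x, y_bottom, x, y_top) by find_lines
+    vertical = (100, 300, 100, 50)
+    horizontal = (95, 100, 400, 100)
+    return intersects(vertical, horizontal)  # True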
+
+
+def draw_v(image, h_lines):
+    """Draws the vertical lines between the given horizontal lines,
+    correcting the image.
+
+    Parameters
+    ----------
+    image : object
+        numpy.ndarray representing the image.
+    h_lines : list
+        List of tuples representing horizontal lines with coordinates.
+
+    Returns
+    -------
+    image : object
+        numpy.ndarray representing the new image.
+
+    """
+
+ if len(h_lines) > 0:
+
+ h_lines = sorted(h_lines, key=lambda x: (x[0], x[1]))
+
+ l_x, r_x = h_lines[0][0], h_lines[0][2]
+ u_y, d_y = h_lines[0][1], h_lines[0][1]
+
+ for i in range(len(h_lines)):
+
+ if l_x == h_lines[i][0] and i != len(h_lines) - 1:
+ r_x = max(r_x, h_lines[i][2])
+
+ elif l_x == h_lines[i][0]:
+ d_y = h_lines[i][3]
+ cv2.rectangle(image, pt1=(l_x, u_y), pt2=(r_x, d_y), color=(0, 0, 0), thickness=3)
+
+ else:
+ d_y = h_lines[i - 1][3]
+ cv2.rectangle(image, pt1=(l_x, u_y), pt2=(r_x, d_y), color=(0, 0, 0), thickness=3)
+ l_x, r_x = h_lines[i][0], h_lines[i][2]
+ u_y, d_y = h_lines[i][1], h_lines[i][3]
+
+
+ return image
+
+
+def draw_h(image, v_lines):
+    """Draws the horizontal lines between the given vertical lines,
+    correcting the image.
+
+    Parameters
+    ----------
+    image : object
+        numpy.ndarray representing the image.
+    v_lines : list
+        List of tuples representing vertical lines with coordinates.
+
+    Returns
+    -------
+    image : object
+        numpy.ndarray representing the new image.
+
+    """
+    if len(v_lines) > 0:
+ v_lines = sorted(v_lines, key=lambda x: (x[3], x[0]))
+
+ u_y, d_y = v_lines[0][3], v_lines[0][1]
+
+ for i in range(len(v_lines)):
+
+ if u_y == v_lines[i][3] and i != len(v_lines) - 1:
+ d_y = max(d_y, v_lines[i][1])
+
+ elif u_y == v_lines[i][3]:
+ d_y = max(d_y, v_lines[i][1])
+ cv2.rectangle(image, pt1=(50, u_y), pt2=(image.shape[1] - 50, d_y), color=(0, 0, 0), thickness=3)
+
+ else:
+ cv2.rectangle(image, pt1=(50, u_y), pt2=(image.shape[1] - 50, d_y), color=(0, 0, 0), thickness=3)
+ u_y, d_y = v_lines[i][3], v_lines[i][1]
+
+ return image
+
+def correct_lines(image, v_segments, h_segments):
+    """Corrects the image by drawing the lines and rectangles implied by
+    groups of intersecting segments, so that partially ruled tables get
+    closed borders.
+
+    Parameters
+    ----------
+    image : object
+        numpy.ndarray representing the image.
+    v_segments : list
+        List of tuples representing vertical lines with coordinates.
+    h_segments : list
+        List of tuples representing horizontal lines with coordinates.
+
+    Returns
+    -------
+    image : object
+        numpy.ndarray representing the new image.
+
+    """
+
+ h_size, v_size = len(h_segments), len(v_segments)
+
+ if h_size > 1 and v_size == 0:
+ image = draw_v(image, h_segments)
+
+ elif h_size == 0 and v_size > 1:
+ image = draw_h(image, v_segments)
+
+ elif v_size >= 1 and h_size >= 1:
+
+        ribs = v_segments[:] + h_segments[:]
+        # start with one single-segment group per rib; groups are merged
+        # whenever two ribs from different groups intersect
+        segments = [[rib] for rib in ribs]
+
+        for i in range(0, len(ribs) - 1):
+            for j in range(i + 1, len(ribs)):
+                if intersects(ribs[i], ribs[j]):
+ for sg1 in segments:
+ cur_sg = []
+ if ribs[i] in sg1:
+ cur_sg = sg1
+ break
+
+ for sg2 in segments:
+ del_sg = []
+ if ribs[j] in sg2 and cur_sg != sg2:
+ cur_sg += sg2[:]
+ del_sg = sg2
+ break
+ if del_sg in segments:
+ segments.remove(del_sg)
+
+
+ s_lines = []
+
+ for i in range(len(segments)):
+
+ min_x, min_y = segments[i][0][0], segments[i][0][3]
+ max_x, max_y = segments[i][0][2], segments[i][0][1]
+
+ if len(segments[i]) > 1:
+ for line in segments[i]:
+                    min_x, min_y = min(min_x, line[0]), min(min_y, line[3])
+                    max_x, max_y = max(max_x, line[2]), max(max_y, line[1])
+ cv2.rectangle(image, pt1=(min_x, min_y), pt2=(max_x, max_y), color=(0, 0, 0), thickness=3)
+ else:
+ s_lines += segments[i]
+
+ h_s_lines, v_s_lines = [], []
+
+ for line in s_lines:
+            if line[0] == line[2]:
+                v_s_lines.append(line)
+            else:
+                h_s_lines.append(line)
+
+ image = draw_h(image, v_s_lines)
+ image = draw_v(image, h_s_lines)
+
+ return image
+
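+
+# --- Usage sketch (illustrative only) ---------------------------------------
+# How the Lattice parser uses correct_lines: detect segments once, close the
+# gaps on the image, then re-threshold and re-detect on the corrected image;
+# the file name is a placeholder.
+def _correct_lines_example(imagename="page-1.png"):
+    img, threshold = adaptive_threshold(imagename)
+    _, v_segments = find_lines(threshold, direction="vertical")
+    _, h_segments = find_lines(threshold, direction="horizontal")
+    img = correct_lines(img, v_segments, h_segments)
+    _, threshold = adaptive_threshold_with_img(img)
+    return img, threshold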
diff --git a/src/main/python/camelot/io.py b/src/main/python/camelot/io.py
new file mode 100644
index 00000000..a27a7c66
--- /dev/null
+++ b/src/main/python/camelot/io.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+
+import warnings
+
+from .handlers import PDFHandler
+from .utils import validate_input, remove_extra
+
+
+def read_pdf(
+ filepath,
+ pages="1",
+ password=None,
+ flavor="lattice",
+ suppress_stdout=False,
+ layout_kwargs={},
+ **kwargs
+):
+ """Read PDF and return extracted tables.
+
+ Note: kwargs annotated with ^ can only be used with flavor='stream'
+ and kwargs annotated with * can only be used with flavor='lattice'.
+
+ Parameters
+ ----------
+ filepath : str
+ Filepath or URL of the PDF file.
+ pages : str, optional (default: '1')
+ Comma-separated page numbers.
+ Example: '1,3,4' or '1,4-end' or 'all'.
+ password : str, optional (default: None)
+ Password for decryption.
+ flavor : str (default: 'lattice')
+ The parsing method to use ('lattice' or 'stream').
+ Lattice is used by default.
+    suppress_stdout : bool, optional (default: False)
+        Suppress logs and warnings.
+ layout_kwargs : dict, optional (default: {})
+        A dict of pdfminer.layout.LAParams kwargs.
+ table_areas : list, optional (default: None)
+ List of table area strings of the form x1,y1,x2,y2
+ where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+ in PDF coordinate space.
+ columns^ : list, optional (default: None)
+ List of column x-coordinates strings where the coordinates
+ are comma-separated.
+ split_text : bool, optional (default: False)
+ Split text that spans across multiple cells.
+ flag_size : bool, optional (default: False)
+ Flag text based on font size. Useful to detect
+        super/subscripts. Adds <s></s> around flagged text.
+ strip_text : str, optional (default: '')
+ Characters that should be stripped from a string before
+ assigning it to a cell.
+ row_tol^ : int, optional (default: 2)
+ Tolerance parameter used to combine text vertically,
+ to generate rows.
+ column_tol^ : int, optional (default: 0)
+ Tolerance parameter used to combine text horizontally,
+ to generate columns.
+ process_background* : bool, optional (default: False)
+ Process background lines.
+ line_scale* : int, optional (default: 15)
+ Line size scaling factor. The larger the value the smaller
+ the detected lines. Making it very large will lead to text
+ being detected as lines.
+ copy_text* : list, optional (default: None)
+ {'h', 'v'}
+ Direction in which text in a spanning cell will be copied
+ over.
+ shift_text* : list, optional (default: ['l', 't'])
+ {'l', 'r', 't', 'b'}
+ Direction in which text in a spanning cell will flow.
+ line_tol* : int, optional (default: 2)
+ Tolerance parameter used to merge close vertical and horizontal
+ lines.
+ joint_tol* : int, optional (default: 2)
+ Tolerance parameter used to decide whether the detected lines
+ and points lie close to each other.
+ threshold_blocksize* : int, optional (default: 15)
+ Size of a pixel neighborhood that is used to calculate a
+ threshold value for the pixel: 3, 5, 7, and so on.
+
+        For more information, refer to OpenCV's adaptiveThreshold documentation.
+ threshold_constant* : int, optional (default: -2)
+ Constant subtracted from the mean or weighted mean.
+ Normally, it is positive but may be zero or negative as well.
+
+        For more information, refer to OpenCV's adaptiveThreshold documentation.
+    iterations* : int, optional (default: 0)
+        Number of times erosion/dilation is applied.
+
+        For more information, refer to OpenCV's dilate documentation.
+ resolution* : int, optional (default: 300)
+ Resolution used for PDF to PNG conversion.
+
+ Returns
+ -------
+ tables : camelot.core.TableList
+
+ """
+ if flavor not in ["lattice", "stream"]:
+ raise NotImplementedError(
+ "Unknown flavor specified." " Use either 'lattice' or 'stream'"
+ )
+
+ with warnings.catch_warnings():
+ if suppress_stdout:
+ warnings.simplefilter("ignore")
+
+ validate_input(kwargs, flavor=flavor)
+ p = PDFHandler(filepath, pages=pages, password=password)
+ kwargs = remove_extra(kwargs, flavor=flavor)
+ tables = p.parse(
+ flavor=flavor,
+ suppress_stdout=suppress_stdout,
+ layout_kwargs=layout_kwargs,
+ **kwargs
+ )
+ return tables
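+
+
+# --- Usage sketch (illustrative only) ---------------------------------------
+# Typical calls for both flavors; "example.pdf" is a placeholder path.
+def _read_pdf_example():
+    # ruled tables: Lattice flavor (default), with a lattice-only kwarg
+    lattice_tables = read_pdf("example.pdf", pages="1-end", line_scale=40)
+    # whitespace-separated tables: Stream flavor, with a stream-only kwarg
+    stream_tables = read_pdf("example.pdf", pages="all", flavor="stream", row_tol=10)
+    return lattice_tables, stream_tables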
diff --git a/src/main/python/camelot/parsers/__init__.py b/src/main/python/camelot/parsers/__init__.py
new file mode 100644
index 00000000..5cc66051
--- /dev/null
+++ b/src/main/python/camelot/parsers/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+
+from .stream import Stream
+from .lattice import Lattice
diff --git a/src/main/python/camelot/parsers/base.py b/src/main/python/camelot/parsers/base.py
new file mode 100644
index 00000000..aeba056f
--- /dev/null
+++ b/src/main/python/camelot/parsers/base.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+
+import os
+
+from ..utils import get_page_layout, get_text_objects
+
+
+class BaseParser(object):
+ """Defines a base parser."""
+
+ def _generate_layout(self, filename, layout_kwargs):
+ self.filename = filename
+ self.layout_kwargs = layout_kwargs
+ self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
+ self.images = get_text_objects(self.layout, ltype="image")
+ self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
+ self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
+ self.pdf_width, self.pdf_height = self.dimensions
+ self.rootname, __ = os.path.splitext(self.filename)
+ self.imagename = "".join([self.rootname, ".png"])
diff --git a/src/main/python/camelot/parsers/lattice.py b/src/main/python/camelot/parsers/lattice.py
new file mode 100644
index 00000000..5d8a79c8
--- /dev/null
+++ b/src/main/python/camelot/parsers/lattice.py
@@ -0,0 +1,464 @@
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import copy
+import locale
+import logging
+import warnings
+
+import numpy as np
+import pandas as pd
+
+from .base import BaseParser
+from ..core import Table
+from ..utils import (
+ scale_image,
+ scale_pdf,
+ segments_in_bbox,
+ text_in_bbox,
+ merge_close_lines,
+ get_table_index,
+ compute_accuracy,
+ compute_whitespace,
+)
+from ..image_processing import (
+ adaptive_threshold,
+ find_lines,
+ find_contours,
+ find_joints,
+ correct_lines,
+ adaptive_threshold_with_img,
+)
+from ..backends.image_conversion import BACKENDS
+
+
+logger = logging.getLogger("camelot")
+
+
+class Lattice(BaseParser):
+ """Lattice method of parsing looks for lines between text
+ to parse the table.
+
+ Parameters
+ ----------
+ table_regions : list, optional (default: None)
+ List of page regions that may contain tables of the form x1,y1,x2,y2
+ where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+ in PDF coordinate space.
+ table_areas : list, optional (default: None)
+ List of table area strings of the form x1,y1,x2,y2
+ where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+ in PDF coordinate space.
+ process_background : bool, optional (default: False)
+ Process background lines.
+ line_scale : int, optional (default: 15)
+ Line size scaling factor. The larger the value the smaller
+ the detected lines. Making it very large will lead to text
+ being detected as lines.
+ copy_text : list, optional (default: None)
+ {'h', 'v'}
+ Direction in which text in a spanning cell will be copied
+ over.
+ shift_text : list, optional (default: ['l', 't'])
+ {'l', 'r', 't', 'b'}
+ Direction in which text in a spanning cell will flow.
+ split_text : bool, optional (default: False)
+ Split text that spans across multiple cells.
+ flag_size : bool, optional (default: False)
+ Flag text based on font size. Useful to detect
+        super/subscripts. Adds <s></s> around flagged text.
+ strip_text : str, optional (default: '')
+ Characters that should be stripped from a string before
+ assigning it to a cell.
+ line_tol : int, optional (default: 2)
+ Tolerance parameter used to merge close vertical and horizontal
+ lines.
+ joint_tol : int, optional (default: 2)
+ Tolerance parameter used to decide whether the detected lines
+ and points lie close to each other.
+ threshold_blocksize : int, optional (default: 15)
+ Size of a pixel neighborhood that is used to calculate a
+ threshold value for the pixel: 3, 5, 7, and so on.
+
+        For more information, refer to OpenCV's adaptiveThreshold documentation.
+ threshold_constant : int, optional (default: -2)
+ Constant subtracted from the mean or weighted mean.
+ Normally, it is positive but may be zero or negative as well.
+
+        For more information, refer to OpenCV's adaptiveThreshold documentation.
+    iterations : int, optional (default: 0)
+        Number of times erosion/dilation is applied.
+
+        For more information, refer to OpenCV's dilate documentation.
+ resolution : int, optional (default: 300)
+ Resolution used for PDF to PNG conversion.
+
+ """
+
+ def __init__(
+ self,
+ table_regions=None,
+ table_areas=None,
+ process_background=False,
+ line_scale=15,
+ copy_text=None,
+ shift_text=["l", "t"],
+ split_text=False,
+ flag_size=False,
+ strip_text="",
+ line_tol=2,
+ joint_tol=2,
+ threshold_blocksize=15,
+ threshold_constant=-2,
+ iterations=0,
+ resolution=300,
+ backend="ghostscript",
+ **kwargs,
+ ):
+ self.table_regions = table_regions
+ self.table_areas = table_areas
+ self.process_background = process_background
+ self.line_scale = line_scale
+ self.copy_text = copy_text
+ self.shift_text = shift_text
+ self.split_text = split_text
+ self.flag_size = flag_size
+ self.strip_text = strip_text
+ self.line_tol = line_tol
+ self.joint_tol = joint_tol
+ self.threshold_blocksize = threshold_blocksize
+ self.threshold_constant = threshold_constant
+ self.iterations = iterations
+ self.resolution = resolution
+ self.backend = Lattice._get_backend(backend)
+
+ @staticmethod
+ def _get_backend(backend):
+ def implements_convert():
+ methods = [
+ method for method in dir(backend) if method.startswith("__") is False
+ ]
+ return "convert" in methods
+
+ if isinstance(backend, str):
+ if backend not in BACKENDS.keys():
+ raise NotImplementedError(
+ f"Unknown backend '{backend}' specified. Please use either 'poppler' or 'ghostscript'."
+ )
+
+ if backend == "ghostscript":
+ warnings.warn(
+ "'ghostscript' will be replaced by 'poppler' as the default image conversion"
+ " backend in v0.12.0. You can try out 'poppler' with backend='poppler'.",
+ DeprecationWarning,
+ )
+
+ return BACKENDS[backend]()
+ else:
+ if not implements_convert():
+ raise NotImplementedError(
+ f"'{backend}' must implement a 'convert' method"
+ )
+
+ return backend
+
+ @staticmethod
+ def _reduce_index(t, idx, shift_text):
+ """Reduces index of a text object if it lies within a spanning
+ cell.
+
+ Parameters
+ ----------
+        t : camelot.core.Table
+ idx : list
+ List of tuples of the form (r_idx, c_idx, text).
+ shift_text : list
+ {'l', 'r', 't', 'b'}
+ Select one or more strings from above and pass them as a
+ list to specify where the text in a spanning cell should
+ flow.
+
+ Returns
+ -------
+ indices : list
+ List of tuples of the form (r_idx, c_idx, text) where
+ r_idx and c_idx are new row and column indices for text.
+
+ """
+ indices = []
+ for r_idx, c_idx, text in idx:
+ for d in shift_text:
+ if d == "l":
+ if t.cells[r_idx][c_idx].hspan:
+ while not t.cells[r_idx][c_idx].left:
+ c_idx -= 1
+ if d == "r":
+ if t.cells[r_idx][c_idx].hspan:
+ while not t.cells[r_idx][c_idx].right:
+ c_idx += 1
+ if d == "t":
+ if t.cells[r_idx][c_idx].vspan:
+ while not t.cells[r_idx][c_idx].top:
+ r_idx -= 1
+ if d == "b":
+ if t.cells[r_idx][c_idx].vspan:
+ while not t.cells[r_idx][c_idx].bottom:
+ r_idx += 1
+ indices.append((r_idx, c_idx, text))
+ return indices
+
+ @staticmethod
+ def _copy_spanning_text(t, copy_text=None):
+ """Copies over text in empty spanning cells.
+
+ Parameters
+ ----------
+ t : camelot.core.Table
+ copy_text : list, optional (default: None)
+ {'h', 'v'}
+ Select one or more strings from above and pass them as a list
+ to specify the direction in which text should be copied over
+ when a cell spans multiple rows or columns.
+
+ Returns
+ -------
+ t : camelot.core.Table
+
+ """
+ for f in copy_text:
+ if f == "h":
+ for i in range(len(t.cells)):
+ for j in range(len(t.cells[i])):
+ if t.cells[i][j].text.strip() == "":
+ if t.cells[i][j].hspan and not t.cells[i][j].left:
+ t.cells[i][j].text = t.cells[i][j - 1].text
+ elif f == "v":
+ for i in range(len(t.cells)):
+ for j in range(len(t.cells[i])):
+ if t.cells[i][j].text.strip() == "":
+ if t.cells[i][j].vspan and not t.cells[i][j].top:
+ t.cells[i][j].text = t.cells[i - 1][j].text
+ return t
+
+ def _generate_table_bbox(self):
+ def scale_areas(areas):
+ scaled_areas = []
+ for area in areas:
+ x1, y1, x2, y2 = area.split(",")
+ x1 = float(x1)
+ y1 = float(y1)
+ x2 = float(x2)
+ y2 = float(y2)
+ x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
+ scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
+ return scaled_areas
+
+ self.image, self.threshold = adaptive_threshold(
+ self.imagename,
+ process_background=self.process_background,
+ blocksize=self.threshold_blocksize,
+ c=self.threshold_constant,
+ )
+
+ image_width = self.image.shape[1]
+ image_height = self.image.shape[0]
+ image_width_scaler = image_width / float(self.pdf_width)
+ image_height_scaler = image_height / float(self.pdf_height)
+ pdf_width_scaler = self.pdf_width / float(image_width)
+ pdf_height_scaler = self.pdf_height / float(image_height)
+ image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
+ pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
+
+ if self.table_areas is None:
+ regions = None
+ if self.table_regions is not None:
+ regions = scale_areas(self.table_regions)
+
+ vertical_mask, vertical_segments = find_lines(
+ self.threshold,
+ regions=regions,
+ direction="vertical",
+ line_scale=self.line_scale,
+ iterations=self.iterations,
+ )
+ horizontal_mask, horizontal_segments = find_lines(
+ self.threshold,
+ regions=regions,
+ direction="horizontal",
+ line_scale=self.line_scale,
+ iterations=self.iterations,
+ )
+
+ self.image = correct_lines(
+ self.image,
+ vertical_segments,
+ horizontal_segments
+ )
+ self.image, threshold = adaptive_threshold_with_img(
+ self.image,
+ process_background=self.process_background,
+ blocksize=self.threshold_blocksize,
+ c=self.threshold_constant
+ )
+
+ vertical_mask, vertical_segments = find_lines(
+ threshold,
+ regions=regions,
+ direction="vertical",
+ line_scale=self.line_scale,
+ iterations=self.iterations,
+ )
+ horizontal_mask, horizontal_segments = find_lines(
+ threshold,
+ regions=regions,
+ direction="horizontal",
+ line_scale=self.line_scale,
+ iterations=self.iterations,
+ )
+
+ contours = find_contours(vertical_mask, horizontal_mask)
+ table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
+ else:
+ vertical_mask, vertical_segments = find_lines(
+ self.threshold,
+ direction="vertical",
+ line_scale=self.line_scale,
+ iterations=self.iterations,
+ )
+ horizontal_mask, horizontal_segments = find_lines(
+ self.threshold,
+ direction="horizontal",
+ line_scale=self.line_scale,
+ iterations=self.iterations,
+ )
+
+ areas = scale_areas(self.table_areas)
+ table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
+
+ self.table_bbox_unscaled = copy.deepcopy(table_bbox)
+
+ self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
+ table_bbox, vertical_segments, horizontal_segments, pdf_scalers
+ )
+
+ def _generate_columns_and_rows(self, table_idx, tk):
+ # select elements which lie within table_bbox
+ t_bbox = {}
+ v_s, h_s = segments_in_bbox(
+ tk, self.vertical_segments, self.horizontal_segments
+ )
+ t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
+ t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
+
+ t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
+ t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
+
+ self.t_bbox = t_bbox
+
+ cols, rows = zip(*self.table_bbox[tk])
+ cols, rows = list(cols), list(rows)
+ cols.extend([tk[0], tk[2]])
+ rows.extend([tk[1], tk[3]])
+ # sort horizontal and vertical segments
+ cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
+ rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
+ # make grid using x and y coord of shortlisted rows and cols
+ cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+ rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
+
+ return cols, rows, v_s, h_s
+
+ def _generate_table(self, table_idx, cols, rows, **kwargs):
+ v_s = kwargs.get("v_s")
+ h_s = kwargs.get("h_s")
+ if v_s is None or h_s is None:
+ raise ValueError("No segments found on {}".format(self.rootname))
+
+ table = Table(cols, rows)
+ # set table edges to True using ver+hor lines
+ table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
+ # set table border edges to True
+ table = table.set_border()
+ # set spanning cells to True
+ table = table.set_span()
+
+ pos_errors = []
+ # TODO: have a single list in place of two directional ones?
+ # sorted on x-coordinate based on reading order i.e. LTR or RTL
+ for direction in ["vertical", "horizontal"]:
+ for t in self.t_bbox[direction]:
+ indices, error = get_table_index(
+ table,
+ t,
+ direction,
+ split_text=self.split_text,
+ flag_size=self.flag_size,
+ strip_text=self.strip_text,
+ )
+ if indices[:2] != (-1, -1):
+ pos_errors.append(error)
+ indices = Lattice._reduce_index(
+ table, indices, shift_text=self.shift_text
+ )
+ for r_idx, c_idx, text in indices:
+ table.cells[r_idx][c_idx].text = text
+ accuracy = compute_accuracy([[100, pos_errors]])
+
+ if self.copy_text is not None:
+ table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)
+
+ data = table.data
+ table.df = pd.DataFrame(data)
+ table.shape = table.df.shape
+
+ whitespace = compute_whitespace(data)
+ table.flavor = "lattice"
+ table.accuracy = accuracy
+ table.whitespace = whitespace
+ table.order = table_idx + 1
+ table.page = int(os.path.basename(self.rootname).replace("page-", ""))
+
+ # for plotting
+ _text = []
+ _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
+ _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
+ table._text = _text
+ table._image = (self.image, self.table_bbox_unscaled)
+ table._segments = (self.vertical_segments, self.horizontal_segments)
+ table._textedges = None
+
+ return table
+
+ def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
+ self._generate_layout(filename, layout_kwargs)
+ if not suppress_stdout:
+ logger.info("Processing {}".format(os.path.basename(self.rootname)))
+
+ if not self.horizontal_text:
+ if self.images:
+ warnings.warn(
+ "{} is image-based, camelot only works on"
+ " text-based pages.".format(os.path.basename(self.rootname))
+ )
+ else:
+ warnings.warn(
+ "No tables found on {}".format(os.path.basename(self.rootname))
+ )
+ return []
+
+ self.backend.convert(self.filename, self.imagename)
+
+ self._generate_table_bbox()
+
+ _tables = []
+ # sort tables based on y-coord
+ for table_idx, tk in enumerate(
+ sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
+ ):
+ cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
+ table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
+ table._bbox = tk
+ _tables.append(table)
+
+ return _tables
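+
+
+# --- Usage sketch (illustrative only) ---------------------------------------
+# Direct use of the parser on a single-page PDF, as PDFHandler does; the page
+# number is derived from the file name, so it must look like page-<n>.pdf,
+# and an image conversion backend (ghostscript or poppler) must be available.
+def _lattice_example(single_page_pdf="page-1.pdf"):
+    parser = Lattice(line_scale=40, process_background=False)
+    return parser.extract_tables(single_page_pdf)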
diff --git a/src/main/python/camelot/parsers/stream.py b/src/main/python/camelot/parsers/stream.py
new file mode 100644
index 00000000..c7b21daf
--- /dev/null
+++ b/src/main/python/camelot/parsers/stream.py
@@ -0,0 +1,468 @@
+# -*- coding: utf-8 -*-
+
+import os
+import logging
+import warnings
+
+import numpy as np
+import pandas as pd
+
+from .base import BaseParser
+from ..core import TextEdges, Table
+from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace
+
+
+logger = logging.getLogger("camelot")
+
+
+class Stream(BaseParser):
+ """Stream method of parsing looks for spaces between text
+ to parse the table.
+
+ If you want to specify columns when specifying multiple table
+    areas, make sure that the lengths of both lists are equal.
+
+ Parameters
+ ----------
+ table_regions : list, optional (default: None)
+ List of page regions that may contain tables of the form x1,y1,x2,y2
+ where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+ in PDF coordinate space.
+ table_areas : list, optional (default: None)
+ List of table area strings of the form x1,y1,x2,y2
+ where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+ in PDF coordinate space.
+ columns : list, optional (default: None)
+ List of column x-coordinates strings where the coordinates
+ are comma-separated.
+ split_text : bool, optional (default: False)
+ Split text that spans across multiple cells.
+ flag_size : bool, optional (default: False)
+ Flag text based on font size. Useful to detect
+        super/subscripts. Adds <s></s> around flagged text.
+ strip_text : str, optional (default: '')
+ Characters that should be stripped from a string before
+ assigning it to a cell.
+ edge_tol : int, optional (default: 50)
+ Tolerance parameter for extending textedges vertically.
+ row_tol : int, optional (default: 2)
+ Tolerance parameter used to combine text vertically,
+ to generate rows.
+ column_tol : int, optional (default: 0)
+ Tolerance parameter used to combine text horizontally,
+ to generate columns.
+
+ """
+
+ def __init__(
+ self,
+ table_regions=None,
+ table_areas=None,
+ columns=None,
+ split_text=False,
+ flag_size=False,
+ strip_text="",
+ edge_tol=50,
+ row_tol=2,
+ column_tol=0,
+ **kwargs,
+ ):
+ self.table_regions = table_regions
+ self.table_areas = table_areas
+ self.columns = columns
+ self._validate_columns()
+ self.split_text = split_text
+ self.flag_size = flag_size
+ self.strip_text = strip_text
+ self.edge_tol = edge_tol
+ self.row_tol = row_tol
+ self.column_tol = column_tol
+
+ @staticmethod
+ def _text_bbox(t_bbox):
+ """Returns bounding box for the text present on a page.
+
+ Parameters
+ ----------
+ t_bbox : dict
+ Dict with two keys 'horizontal' and 'vertical' with lists of
+ LTTextLineHorizontals and LTTextLineVerticals respectively.
+
+ Returns
+ -------
+ text_bbox : tuple
+ Tuple (x0, y0, x1, y1) in pdf coordinate space.
+
+ """
+ xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
+ ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
+ xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
+ ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
+ text_bbox = (xmin, ymin, xmax, ymax)
+ return text_bbox
+
+ @staticmethod
+ def _group_rows(text, row_tol=2):
+ """Groups PDFMiner text objects into rows vertically
+ within a tolerance.
+
+ Parameters
+ ----------
+ text : list
+ List of PDFMiner text objects.
+ row_tol : int, optional (default: 2)
+
+ Returns
+ -------
+ rows : list
+ Two-dimensional list of text objects grouped into rows.
+
+ """
+ row_y = 0
+ rows = []
+ temp = []
+
+ for t in text:
+ # is checking for upright necessary?
+ # if t.get_text().strip() and all([obj.upright for obj in t._objs if
+ # type(obj) is LTChar]):
+ if t.get_text().strip():
+ if not np.isclose(row_y, t.y0, atol=row_tol):
+ rows.append(sorted(temp, key=lambda t: t.x0))
+ temp = []
+ row_y = t.y0
+ temp.append(t)
+
+ rows.append(sorted(temp, key=lambda t: t.x0))
+ if len(rows) > 1:
+ __ = rows.pop(0) # TODO: hacky
+ return rows
+
+ @staticmethod
+ def _merge_columns(l, column_tol=0):
+ """Merges column boundaries horizontally if they overlap
+ or lie within a tolerance.
+
+ Parameters
+ ----------
+ l : list
+ List of column x-coordinate tuples.
+ column_tol : int, optional (default: 0)
+
+ Returns
+ -------
+ merged : list
+ List of merged column x-coordinate tuples.
+
+ """
+ merged = []
+ for higher in l:
+ if not merged:
+ merged.append(higher)
+ else:
+ lower = merged[-1]
+ if column_tol >= 0:
+ if higher[0] <= lower[1] or np.isclose(
+ higher[0], lower[1], atol=column_tol
+ ):
+ upper_bound = max(lower[1], higher[1])
+ lower_bound = min(lower[0], higher[0])
+ merged[-1] = (lower_bound, upper_bound)
+ else:
+ merged.append(higher)
+ elif column_tol < 0:
+ if higher[0] <= lower[1]:
+ if np.isclose(higher[0], lower[1], atol=abs(column_tol)):
+ merged.append(higher)
+ else:
+ upper_bound = max(lower[1], higher[1])
+ lower_bound = min(lower[0], higher[0])
+ merged[-1] = (lower_bound, upper_bound)
+ else:
+ merged.append(higher)
+ return merged
+
+ @staticmethod
+ def _join_rows(rows_grouped, text_y_max, text_y_min):
+ """Makes row coordinates continuous.
+
+ Parameters
+ ----------
+ rows_grouped : list
+ Two-dimensional list of text objects grouped into rows.
+ text_y_max : int
+ text_y_min : int
+
+ Returns
+ -------
+ rows : list
+ List of continuous row y-coordinate tuples.
+
+ """
+ row_mids = [
+ sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0
+ for r in rows_grouped
+ ]
+ rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
+ rows.insert(0, text_y_max)
+ rows.append(text_y_min)
+ rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
+ return rows
+
+ @staticmethod
+ def _add_columns(cols, text, row_tol):
+ """Adds columns to existing list by taking into account
+ the text that lies outside the current column x-coordinates.
+
+ Parameters
+ ----------
+ cols : list
+ List of column x-coordinate tuples.
+ text : list
+ List of PDFMiner text objects.
+        row_tol : int
+
+ Returns
+ -------
+ cols : list
+ Updated list of column x-coordinate tuples.
+
+ """
+ if text:
+ text = Stream._group_rows(text, row_tol=row_tol)
+ elements = [len(r) for r in text]
+ new_cols = [
+ (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r
+ ]
+ cols.extend(Stream._merge_columns(sorted(new_cols)))
+ return cols
+
+ @staticmethod
+ def _join_columns(cols, text_x_min, text_x_max):
+ """Makes column coordinates continuous.
+
+ Parameters
+ ----------
+ cols : list
+ List of column x-coordinate tuples.
+ text_x_min : int
+        text_x_max : int
+
+ Returns
+ -------
+ cols : list
+ Updated list of column x-coordinate tuples.
+
+ """
+ cols = sorted(cols)
+ cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
+ cols.insert(0, text_x_min)
+ cols.append(text_x_max)
+ cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+ return cols
+
+ def _validate_columns(self):
+ if self.table_areas is not None and self.columns is not None:
+ if len(self.table_areas) != len(self.columns):
+ raise ValueError("Length of table_areas and columns" " should be equal")
+
+ def _nurminen_table_detection(self, textlines):
+ """A general implementation of the table detection algorithm
+ described by Anssi Nurminen's master's thesis.
+ Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
+
+ Assumes that tables are situated relatively far apart
+ vertically.
+ """
+ # TODO: add support for arabic text #141
+ # sort textlines in reading order
+ textlines.sort(key=lambda x: (-x.y0, x.x0))
+ textedges = TextEdges(edge_tol=self.edge_tol)
+ # generate left, middle and right textedges
+ textedges.generate(textlines)
+ # select relevant edges
+ relevant_textedges = textedges.get_relevant()
+ self.textedges.extend(relevant_textedges)
+ # guess table areas using textlines and relevant edges
+ table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
+ # treat whole page as table area if no table areas found
+ if not len(table_bbox):
+ table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
+
+ return table_bbox
+
+ def _generate_table_bbox(self):
+ self.textedges = []
+ if self.table_areas is None:
+ hor_text = self.horizontal_text
+ if self.table_regions is not None:
+ # filter horizontal text
+ hor_text = []
+ for region in self.table_regions:
+ x1, y1, x2, y2 = region.split(",")
+ x1 = float(x1)
+ y1 = float(y1)
+ x2 = float(x2)
+ y2 = float(y2)
+ region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
+ hor_text.extend(region_text)
+ # find tables based on nurminen's detection algorithm
+ table_bbox = self._nurminen_table_detection(hor_text)
+ else:
+ table_bbox = {}
+ for area in self.table_areas:
+ x1, y1, x2, y2 = area.split(",")
+ x1 = float(x1)
+ y1 = float(y1)
+ x2 = float(x2)
+ y2 = float(y2)
+ table_bbox[(x1, y2, x2, y1)] = None
+ self.table_bbox = table_bbox
+
+ def _generate_columns_and_rows(self, table_idx, tk):
+ # select elements which lie within table_bbox
+ t_bbox = {}
+ t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
+ t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
+
+ t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
+ t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
+
+ self.t_bbox = t_bbox
+
+ text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
+ rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol)
+ rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
+ elements = [len(r) for r in rows_grouped]
+
+ if self.columns is not None and self.columns[table_idx] != "":
+ # user has to input boundary columns too
+ # take (0, pdf_width) by default
+ # similar to else condition
+ # len can't be 1
+ cols = self.columns[table_idx].split(",")
+ cols = [float(c) for c in cols]
+ cols.insert(0, text_x_min)
+ cols.append(text_x_max)
+ cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+ else:
+ # calculate mode of the list of number of elements in
+ # each row to guess the number of columns
+ if not len(elements):
+ cols = [(text_x_min, text_x_max)]
+ else:
+ ncols = max(set(elements), key=elements.count)
+ if ncols == 1:
+                    # if mode is 1, the page usually contains no tables
+ # but there can be cases where the list can be skewed,
+ # try to remove all 1s from list in this case and
+ # see if the list contains elements, if yes, then use
+ # the mode after removing 1s
+ elements = list(filter(lambda x: x != 1, elements))
+ if len(elements):
+ ncols = max(set(elements), key=elements.count)
+ else:
+ warnings.warn(f"No tables found in table area {table_idx + 1}")
+ cols = [
+ (t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r
+ ]
+ cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
+ inner_text = []
+ for i in range(1, len(cols)):
+ left = cols[i - 1][1]
+ right = cols[i][0]
+ inner_text.extend(
+ [
+ t
+ for direction in self.t_bbox
+ for t in self.t_bbox[direction]
+ if t.x0 > left and t.x1 < right
+ ]
+ )
+ outer_text = [
+ t
+ for direction in self.t_bbox
+ for t in self.t_bbox[direction]
+ if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
+ ]
+ inner_text.extend(outer_text)
+ cols = self._add_columns(cols, inner_text, self.row_tol)
+ cols = self._join_columns(cols, text_x_min, text_x_max)
+
+ return cols, rows
+
+ def _generate_table(self, table_idx, cols, rows, **kwargs):
+ table = Table(cols, rows)
+ table = table.set_all_edges()
+
+ pos_errors = []
+ # TODO: have a single list in place of two directional ones?
+ # sorted on x-coordinate based on reading order i.e. LTR or RTL
+ for direction in ["vertical", "horizontal"]:
+ for t in self.t_bbox[direction]:
+ indices, error = get_table_index(
+ table,
+ t,
+ direction,
+ split_text=self.split_text,
+ flag_size=self.flag_size,
+ strip_text=self.strip_text,
+ )
+ if indices[:2] != (-1, -1):
+ pos_errors.append(error)
+ for r_idx, c_idx, text in indices:
+ table.cells[r_idx][c_idx].text = text
+ accuracy = compute_accuracy([[100, pos_errors]])
+
+ data = table.data
+ table.df = pd.DataFrame(data)
+ table.shape = table.df.shape
+
+ whitespace = compute_whitespace(data)
+ table.flavor = "stream"
+ table.accuracy = accuracy
+ table.whitespace = whitespace
+ table.order = table_idx + 1
+ table.page = int(os.path.basename(self.rootname).replace("page-", ""))
+
+ # for plotting
+ _text = []
+ _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
+ _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
+ table._text = _text
+ table._image = None
+ table._segments = None
+ table._textedges = self.textedges
+
+ return table
+
+ def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
+ self._generate_layout(filename, layout_kwargs)
+ base_filename = os.path.basename(self.rootname)
+
+ if not suppress_stdout:
+ logger.info(f"Processing {base_filename}")
+
+ if not self.horizontal_text:
+ if self.images:
+ warnings.warn(
+ f"{base_filename} is image-based, camelot only works on"
+ " text-based pages."
+ )
+ else:
+ warnings.warn(f"No tables found on {base_filename}")
+ return []
+
+ self._generate_table_bbox()
+
+ _tables = []
+ # sort tables based on y-coord
+ for table_idx, tk in enumerate(
+ sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
+ ):
+ cols, rows = self._generate_columns_and_rows(table_idx, tk)
+ table = self._generate_table(table_idx, cols, rows)
+ table._bbox = tk
+ _tables.append(table)
+
+ return _tables
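+
+
+# --- Usage sketch (illustrative only) ---------------------------------------
+# Direct use of the parser on a single-page PDF, as PDFHandler does; the page
+# number is derived from the file name, so it must look like page-<n>.pdf.
+def _stream_example(single_page_pdf="page-1.pdf"):
+    parser = Stream(edge_tol=500, row_tol=10)
+    return parser.extract_tables(single_page_pdf)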
diff --git a/src/main/python/camelot/plotting.py b/src/main/python/camelot/plotting.py
new file mode 100644
index 00000000..f5b6afe9
--- /dev/null
+++ b/src/main/python/camelot/plotting.py
@@ -0,0 +1,225 @@
+# -*- coding: utf-8 -*-
+
+try:
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+except ImportError:
+ _HAS_MPL = False
+else:
+ _HAS_MPL = True
+
+
+class PlotMethods(object):
+ def __call__(self, table, kind="text", filename=None):
+ """Plot elements found on PDF page based on kind
+ specified, useful for debugging and playing with different
+ parameters to get the best output.
+
+ Parameters
+ ----------
+ table: camelot.core.Table
+ A Camelot Table.
+ kind : str, optional (default: 'text')
+            {'text', 'grid', 'contour', 'textedge', 'joint', 'line'}
+ The element type for which a plot should be generated.
+        filename : str, optional (default: None)
+ Absolute path for saving the generated plot.
+
+ Returns
+ -------
+ fig : matplotlib.fig.Figure
+
+ """
+ if not _HAS_MPL:
+ raise ImportError("matplotlib is required for plotting.")
+
+ if table.flavor == "lattice" and kind in ["textedge"]:
+ raise NotImplementedError(f"Lattice flavor does not support kind='{kind}'")
+ elif table.flavor == "stream" and kind in ["joint", "line"]:
+ raise NotImplementedError(f"Stream flavor does not support kind='{kind}'")
+
+ plot_method = getattr(self, kind)
+ fig = plot_method(table)
+
+ if filename is not None:
+ fig.savefig(filename)
+ return None
+
+ return fig
+
+ def text(self, table):
+ """Generates a plot for all text elements present
+ on the PDF page.
+
+ Parameters
+ ----------
+ table : camelot.core.Table
+
+ Returns
+ -------
+ fig : matplotlib.fig.Figure
+
+ """
+ fig = plt.figure()
+ ax = fig.add_subplot(111, aspect="equal")
+ xs, ys = [], []
+ for t in table._text:
+ xs.extend([t[0], t[2]])
+ ys.extend([t[1], t[3]])
+ ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1]))
+ ax.set_xlim(min(xs) - 10, max(xs) + 10)
+ ax.set_ylim(min(ys) - 10, max(ys) + 10)
+ return fig
+
+ def grid(self, table):
+ """Generates a plot for the detected table grids
+ on the PDF page.
+
+ Parameters
+ ----------
+ table : camelot.core.Table
+
+ Returns
+ -------
+ fig : matplotlib.fig.Figure
+
+ """
+ fig = plt.figure()
+ ax = fig.add_subplot(111, aspect="equal")
+ for row in table.cells:
+ for cell in row:
+ if cell.left:
+ ax.plot([cell.lb[0], cell.lt[0]], [cell.lb[1], cell.lt[1]])
+ if cell.right:
+ ax.plot([cell.rb[0], cell.rt[0]], [cell.rb[1], cell.rt[1]])
+ if cell.top:
+ ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
+ if cell.bottom:
+ ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
+ return fig
+
+ def contour(self, table):
+ """Generates a plot for all table boundaries present
+ on the PDF page.
+
+ Parameters
+ ----------
+ table : camelot.core.Table
+
+ Returns
+ -------
+ fig : matplotlib.fig.Figure
+
+ """
+ try:
+ img, table_bbox = table._image
+ _FOR_LATTICE = True
+ except TypeError:
+ img, table_bbox = (None, {table._bbox: None})
+ _FOR_LATTICE = False
+ fig = plt.figure()
+ ax = fig.add_subplot(111, aspect="equal")
+
+ xs, ys = [], []
+ if not _FOR_LATTICE:
+ for t in table._text:
+ xs.extend([t[0], t[2]])
+ ys.extend([t[1], t[3]])
+ ax.add_patch(
+ patches.Rectangle(
+ (t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue"
+ )
+ )
+
+ for t in table_bbox.keys():
+ ax.add_patch(
+ patches.Rectangle(
+ (t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red"
+ )
+ )
+ if not _FOR_LATTICE:
+ xs.extend([t[0], t[2]])
+ ys.extend([t[1], t[3]])
+ ax.set_xlim(min(xs) - 10, max(xs) + 10)
+ ax.set_ylim(min(ys) - 10, max(ys) + 10)
+
+ if _FOR_LATTICE:
+ ax.imshow(img)
+ return fig
+
+ def textedge(self, table):
+ """Generates a plot for relevant textedges.
+
+ Parameters
+ ----------
+ table : camelot.core.Table
+
+ Returns
+ -------
+ fig : matplotlib.fig.Figure
+
+ """
+ fig = plt.figure()
+ ax = fig.add_subplot(111, aspect="equal")
+ xs, ys = [], []
+ for t in table._text:
+ xs.extend([t[0], t[2]])
+ ys.extend([t[1], t[3]])
+ ax.add_patch(
+ patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue")
+ )
+ ax.set_xlim(min(xs) - 10, max(xs) + 10)
+ ax.set_ylim(min(ys) - 10, max(ys) + 10)
+
+ for te in table._textedges:
+ ax.plot([te.x, te.x], [te.y0, te.y1])
+
+ return fig
+
+ def joint(self, table):
+ """Generates a plot for all line intersections present
+ on the PDF page.
+
+ Parameters
+ ----------
+ table : camelot.core.Table
+
+ Returns
+ -------
+ fig : matplotlib.fig.Figure
+
+ """
+ img, table_bbox = table._image
+ fig = plt.figure()
+ ax = fig.add_subplot(111, aspect="equal")
+ x_coord = []
+ y_coord = []
+ for k in table_bbox.keys():
+ for coord in table_bbox[k]:
+ x_coord.append(coord[0])
+ y_coord.append(coord[1])
+ ax.plot(x_coord, y_coord, "ro")
+ ax.imshow(img)
+ return fig
+
+ def line(self, table):
+ """Generates a plot for all line segments present
+ on the PDF page.
+
+ Parameters
+ ----------
+ table : camelot.core.Table
+
+ Returns
+ -------
+ fig : matplotlib.fig.Figure
+
+ """
+ fig = plt.figure()
+ ax = fig.add_subplot(111, aspect="equal")
+ vertical, horizontal = table._segments
+ for v in vertical:
+ ax.plot([v[0], v[2]], [v[1], v[3]])
+ for h in horizontal:
+ ax.plot([h[0], h[2]], [h[1], h[3]])
+ return fig
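+
+
+# --- Usage sketch (illustrative only) ---------------------------------------
+# Upstream camelot exposes an instance of PlotMethods as camelot.plot; given a
+# parsed table, each kind returns a matplotlib figure, or saves it when a
+# filename (placeholder below) is passed.
+def _plot_example(table):
+    plot = PlotMethods()
+    fig = plot(table, kind="grid")
+    plot(table, kind="contour", filename="contour.png")
+    return fig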
diff --git a/src/main/python/camelot/utils.py b/src/main/python/camelot/utils.py
new file mode 100644
index 00000000..404c00b2
--- /dev/null
+++ b/src/main/python/camelot/utils.py
@@ -0,0 +1,938 @@
+# -*- coding: utf-8 -*-
+
+import os
+import re
+import random
+import shutil
+import string
+import tempfile
+import warnings
+from itertools import groupby
+from operator import itemgetter
+
+import numpy as np
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfpage import PDFTextExtractionNotAllowed
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.layout import (
+ LAParams,
+ LTAnno,
+ LTChar,
+ LTTextLineHorizontal,
+ LTTextLineVertical,
+ LTImage,
+)
+
+from urllib.request import Request, urlopen
+from urllib.parse import urlparse as parse_url
+from urllib.parse import uses_relative, uses_netloc, uses_params
+
+
+_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
+_VALID_URLS.discard("")
+
+
+# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
+def is_url(url):
+ """Check to see if a URL has a valid protocol.
+
+ Parameters
+ ----------
+ url : str or unicode
+
+ Returns
+ -------
+ isurl : bool
+ If url has a valid protocol return True otherwise False.
+
+ """
+ try:
+ return parse_url(url).scheme in _VALID_URLS
+ except Exception:
+ return False
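+
+# Editor's sketch, not part of the original module: is_url only inspects the
+# URL scheme, so filesystem paths are rejected while http(s)/ftp URLs pass:
+#
+#     >>> is_url("https://example.com/report.pdf")
+#     True
+#     >>> is_url("reports/report.pdf")
+#     False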
+
+
+def random_string(length):
+ ret = ""
+ while length:
+ ret += random.choice(
+ string.digits + string.ascii_lowercase + string.ascii_uppercase
+ )
+ length -= 1
+ return ret
+
+
+def download_url(url):
+ """Download file from specified URL.
+
+ Parameters
+ ----------
+ url : str or unicode
+
+ Returns
+ -------
+ filepath : str or unicode
+ Temporary filepath.
+
+ """
+ filename = f"{random_string(6)}.pdf"
+ with tempfile.NamedTemporaryFile("wb", delete=False) as f:
+ headers = {"User-Agent": "Mozilla/5.0"}
+ request = Request(url, None, headers)
+ obj = urlopen(request)
+ content_type = obj.info().get_content_type()
+ if content_type != "application/pdf":
+ raise NotImplementedError("File format not supported")
+ f.write(obj.read())
+ filepath = os.path.join(os.path.dirname(f.name), filename)
+ shutil.move(f.name, filepath)
+ return filepath
+
+
+stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
+lattice_kwargs = [
+ "process_background",
+ "line_scale",
+ "copy_text",
+ "shift_text",
+ "line_tol",
+ "joint_tol",
+ "threshold_blocksize",
+ "threshold_constant",
+ "iterations",
+ "resolution",
+]
+
+
+def validate_input(kwargs, flavor="lattice"):
+ def check_intersection(parser_kwargs, input_kwargs):
+ isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
+ if isec:
+ raise ValueError(
+ f"{','.join(sorted(isec))} cannot be used with flavor='{flavor}'"
+ )
+
+ if flavor == "lattice":
+ check_intersection(stream_kwargs, kwargs)
+ else:
+ check_intersection(lattice_kwargs, kwargs)
+
+
+def remove_extra(kwargs, flavor="lattice"):
+ if flavor == "lattice":
+        # iterate over a copy of the keys; popping from the dict while
+        # iterating over its keys() view would raise a RuntimeError
+        for key in list(kwargs.keys()):
+ if key in stream_kwargs:
+ kwargs.pop(key)
+ else:
+        for key in list(kwargs.keys()):
+ if key in lattice_kwargs:
+ kwargs.pop(key)
+ return kwargs
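+
+# Editor's sketch, not part of the original module: validate_input rejects
+# keyword arguments that belong to the other flavor, while remove_extra
+# silently drops them:
+#
+#     >>> validate_input({"row_tol": 5}, flavor="lattice")
+#     Traceback (most recent call last):
+#         ...
+#     ValueError: row_tol cannot be used with flavor='lattice'
+#     >>> remove_extra({"row_tol": 5, "pages": "1"}, flavor="lattice")
+#     {'pages': '1'}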
+
+
+# https://stackoverflow.com/a/22726782
+class TemporaryDirectory(object):
+ def __enter__(self):
+ self.name = tempfile.mkdtemp()
+ return self.name
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ shutil.rmtree(self.name)
+
+
+def translate(x1, x2):
+ """Translates x2 by x1.
+
+ Parameters
+ ----------
+ x1 : float
+ x2 : float
+
+ Returns
+ -------
+ x2 : float
+
+ """
+ x2 += x1
+ return x2
+
+
+def scale(x, s):
+ """Scales x by scaling factor s.
+
+ Parameters
+ ----------
+ x : float
+ s : float
+
+ Returns
+ -------
+ x : float
+
+ """
+ x *= s
+ return x
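+
+# Editor's sketch, not part of the original module: translate and scale are the
+# two primitives used below for moving between PDF and image coordinates:
+#
+#     >>> translate(-792, 90)   # shift y by the (negated) page height
+#     -702
+#     >>> scale(-702, 2.0)      # then apply a scaling factor
+#     -1404.0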
+
+
+def scale_pdf(k, factors):
+ """Translates and scales pdf coordinate space to image
+ coordinate space.
+
+ Parameters
+ ----------
+ k : tuple
+ Tuple (x1, y1, x2, y2) representing table bounding box where
+ (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
+ space.
+ factors : tuple
+ Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
+ first two elements are scaling factors and pdf_y is height of
+ pdf.
+
+ Returns
+ -------
+ knew : tuple
+ Tuple (x1, y1, x2, y2) representing table bounding box where
+ (x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
+ space.
+
+ """
+ x1, y1, x2, y2 = k
+ scaling_factor_x, scaling_factor_y, pdf_y = factors
+ x1 = scale(x1, scaling_factor_x)
+ y1 = scale(abs(translate(-pdf_y, y1)), scaling_factor_y)
+ x2 = scale(x2, scaling_factor_x)
+ y2 = scale(abs(translate(-pdf_y, y2)), scaling_factor_y)
+ knew = (int(x1), int(y1), int(x2), int(y2))
+ return knew
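+
+# Editor's sketch, not part of the original module: for a 792 pt tall page
+# rendered at twice the PDF resolution in both axes, a PDF-space bounding box
+# maps into image space as follows:
+#
+#     >>> scale_pdf((10, 782, 110, 682), (2, 2, 792))
+#     (20, 20, 220, 220)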
+
+
+def scale_image(tables, v_segments, h_segments, factors):
+ """Translates and scales image coordinate space to pdf
+ coordinate space.
+
+ Parameters
+ ----------
+ tables : dict
+ Dict with table boundaries as keys and list of intersections
+ in that boundary as value.
+ v_segments : list
+ List of vertical line segments.
+ h_segments : list
+ List of horizontal line segments.
+ factors : tuple
+ Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
+ first two elements are scaling factors and img_y is height of
+ image.
+
+ Returns
+ -------
+ tables_new : dict
+ v_segments_new : dict
+ h_segments_new : dict
+
+ """
+ scaling_factor_x, scaling_factor_y, img_y = factors
+ tables_new = {}
+ for k in tables.keys():
+ x1, y1, x2, y2 = k
+ x1 = scale(x1, scaling_factor_x)
+ y1 = scale(abs(translate(-img_y, y1)), scaling_factor_y)
+ x2 = scale(x2, scaling_factor_x)
+ y2 = scale(abs(translate(-img_y, y2)), scaling_factor_y)
+ j_x, j_y = zip(*tables[k])
+ j_x = [scale(j, scaling_factor_x) for j in j_x]
+ j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
+ joints = zip(j_x, j_y)
+ tables_new[(x1, y1, x2, y2)] = joints
+
+ v_segments_new = []
+ for v in v_segments:
+ x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x)
+ y1, y2 = (
+ scale(abs(translate(-img_y, v[1])), scaling_factor_y),
+ scale(abs(translate(-img_y, v[3])), scaling_factor_y),
+ )
+ v_segments_new.append((x1, y1, x2, y2))
+
+ h_segments_new = []
+ for h in h_segments:
+ x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x)
+ y1, y2 = (
+ scale(abs(translate(-img_y, h[1])), scaling_factor_y),
+ scale(abs(translate(-img_y, h[3])), scaling_factor_y),
+ )
+ h_segments_new.append((x1, y1, x2, y2))
+
+ return tables_new, v_segments_new, h_segments_new
+
+
+def get_rotation(chars, horizontal_text, vertical_text):
+ """Detects if text in table is rotated or not using the current
+ transformation matrix (CTM) and returns its orientation.
+
+ Parameters
+ ----------
+    chars : list
+        List of PDFMiner LTChar objects.
+    horizontal_text : list
+        List of PDFMiner LTTextLineHorizontal objects.
+    vertical_text : list
+        List of PDFMiner LTTextLineVertical objects.
+
+ Returns
+ -------
+ rotation : string
+ '' if text in table is upright, 'anticlockwise' if
+        rotated 90 degrees anticlockwise and 'clockwise' if
+        rotated 90 degrees clockwise.
+
+ """
+ rotation = ""
+ hlen = len([t for t in horizontal_text if t.get_text().strip()])
+ vlen = len([t for t in vertical_text if t.get_text().strip()])
+ if hlen < vlen:
+ clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
+ anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
+ rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise"
+ return rotation
+
+
+def segments_in_bbox(bbox, v_segments, h_segments):
+ """Returns all line segments present inside a bounding box.
+
+ Parameters
+ ----------
+ bbox : tuple
+ Tuple (x1, y1, x2, y2) representing a bounding box where
+ (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
+ space.
+ v_segments : list
+ List of vertical line segments.
+ h_segments : list
+        List of horizontal line segments.
+
+ Returns
+ -------
+ v_s : list
+ List of vertical line segments that lie inside table.
+ h_s : list
+ List of horizontal line segments that lie inside table.
+
+ """
+ lb = (bbox[0], bbox[1])
+ rt = (bbox[2], bbox[3])
+ v_s = [
+ v
+ for v in v_segments
+ if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2
+ ]
+ h_s = [
+ h
+ for h in h_segments
+ if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2
+ ]
+ return v_s, h_s
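+
+# Editor's sketch, not part of the original module: only segments lying inside
+# the bounding box (with a 2 pt tolerance) are kept:
+#
+#     >>> segments_in_bbox(
+#     ...     (0, 0, 100, 100),
+#     ...     v_segments=[(10, 5, 10, 95), (200, 5, 200, 95)],
+#     ...     h_segments=[(5, 50, 95, 50)],
+#     ... )
+#     ([(10, 5, 10, 95)], [(5, 50, 95, 50)])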
+
+
+def text_in_bbox(bbox, text):
+ """Returns all text objects present inside a bounding box.
+
+ Parameters
+ ----------
+ bbox : tuple
+ Tuple (x1, y1, x2, y2) representing a bounding box where
+ (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+ space.
+    text : list
+        List of PDFMiner text objects.
+
+ Returns
+ -------
+ t_bbox : list
+ List of PDFMiner text objects that lie inside table, discarding the overlapping ones
+
+ """
+ lb = (bbox[0], bbox[1])
+ rt = (bbox[2], bbox[3])
+ t_bbox = [
+ t
+ for t in text
+ if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2
+ and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2
+ ]
+
+ # Avoid duplicate text by discarding overlapping boxes
+ rest = {t for t in t_bbox}
+ for ba in t_bbox:
+ for bb in rest.copy():
+ if ba == bb:
+ continue
+ if bbox_intersect(ba, bb):
+ # if the intersection is larger than 80% of ba's size, we keep the longest
+ if (bbox_intersection_area(ba, bb) / bbox_area(ba)) > 0.8:
+ if bbox_longer(bb, ba):
+ rest.discard(ba)
+ unique_boxes = list(rest)
+
+ return unique_boxes
+
+
+def bbox_intersection_area(ba, bb) -> float:
+ """Returns area of the intersection of the bounding boxes of two PDFMiner objects.
+
+ Parameters
+ ----------
+ ba : PDFMiner text object
+ bb : PDFMiner text object
+
+ Returns
+ -------
+ intersection_area : float
+ Area of the intersection of the bounding boxes of both objects
+
+ """
+ x_left = max(ba.x0, bb.x0)
+ y_top = min(ba.y1, bb.y1)
+ x_right = min(ba.x1, bb.x1)
+ y_bottom = max(ba.y0, bb.y0)
+
+ if x_right < x_left or y_bottom > y_top:
+ return 0.0
+
+ intersection_area = (x_right - x_left) * (y_top - y_bottom)
+ return intersection_area
+
+
+def bbox_area(bb) -> float:
+ """Returns area of the bounding box of a PDFMiner object.
+
+ Parameters
+ ----------
+ bb : PDFMiner text object
+
+ Returns
+ -------
+ area : float
+ Area of the bounding box of the object
+
+ """
+ return (bb.x1 - bb.x0) * (bb.y1 - bb.y0)
+
+
+def bbox_intersect(ba, bb) -> bool:
+ """Returns True if the bounding boxes of two PDFMiner objects intersect.
+
+ Parameters
+ ----------
+ ba : PDFMiner text object
+ bb : PDFMiner text object
+
+ Returns
+ -------
+ overlaps : bool
+ True if the bounding boxes intersect
+
+ """
+ return ba.x1 >= bb.x0 and bb.x1 >= ba.x0 and ba.y1 >= bb.y0 and bb.y1 >= ba.y0
+
+
+def bbox_longer(ba, bb) -> bool:
+ """Returns True if the bounding box of the first PDFMiner object is longer or equal to the second.
+
+ Parameters
+ ----------
+ ba : PDFMiner text object
+ bb : PDFMiner text object
+
+ Returns
+ -------
+ longer : bool
+ True if the bounding box of the first object is longer or equal
+
+ """
+ return (ba.x1 - ba.x0) >= (bb.x1 - bb.x0)
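+
+# Editor's sketch, not part of the original module: the bbox helpers above only
+# need objects exposing x0/y0/x1/y1, so plain namespaces suffice for a check:
+#
+#     >>> from types import SimpleNamespace
+#     >>> ba = SimpleNamespace(x0=0, y0=0, x1=4, y1=2)
+#     >>> bb = SimpleNamespace(x0=2, y0=1, x1=6, y1=3)
+#     >>> bbox_intersect(ba, bb), bbox_area(ba), bbox_intersection_area(ba, bb)
+#     (True, 8, 2)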
+
+
+def merge_close_lines(ar, line_tol=2):
+ """Merges lines which are within a tolerance by calculating a
+ moving mean, based on their x or y axis projections.
+
+ Parameters
+ ----------
+ ar : list
+ line_tol : int, optional (default: 2)
+
+ Returns
+ -------
+ ret : list
+
+ """
+ ret = []
+ for a in ar:
+ if not ret:
+ ret.append(a)
+ else:
+ temp = ret[-1]
+ if np.isclose(temp, a, atol=line_tol):
+ temp = (temp + a) / 2.0
+ ret[-1] = temp
+ else:
+ ret.append(a)
+ return ret
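+
+# Editor's sketch, not part of the original module: values within line_tol of
+# the running mean are merged into that mean:
+#
+#     >>> merge_close_lines([10, 11, 50, 51.5, 100], line_tol=2)
+#     [10.5, 50.75, 100]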
+
+
+def text_strip(text, strip=""):
+ """Strips any characters in `strip` that are present in `text`.
+ Parameters
+ ----------
+ text : str
+ Text to process and strip.
+ strip : str, optional (default: '')
+ Characters that should be stripped from `text`.
+ Returns
+ -------
+ stripped : str
+ """
+ if not strip:
+ return text
+
+ stripped = re.sub(
+ fr"[{''.join(map(re.escape, strip))}]", "", text, flags=re.UNICODE
+ )
+ return stripped
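+
+# Editor's sketch, not part of the original module: every character listed in
+# `strip` is removed from the text:
+#
+#     >>> text_strip("1,234.56", strip=",.")
+#     '123456'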
+
+
+# TODO: combine the following functions into a TextProcessor class which
+# applies corresponding transformations sequentially
+# (inspired from sklearn.pipeline.Pipeline)
+
+
+def flag_font_size(textline, direction, strip_text=""):
+ """Flags super/subscripts in text by enclosing them with .
+ May give false positives.
+
+ Parameters
+ ----------
+ textline : list
+ List of PDFMiner LTChar objects.
+ direction : string
+ Direction of the PDFMiner LTTextLine object.
+ strip_text : str, optional (default: '')
+ Characters that should be stripped from a string before
+ assigning it to a cell.
+
+ Returns
+ -------
+ fstring : string
+
+ """
+ if direction == "horizontal":
+ d = [
+ (t.get_text(), np.round(t.height, decimals=6))
+ for t in textline
+ if not isinstance(t, LTAnno)
+ ]
+ elif direction == "vertical":
+ d = [
+ (t.get_text(), np.round(t.width, decimals=6))
+ for t in textline
+ if not isinstance(t, LTAnno)
+ ]
+ l = [np.round(size, decimals=6) for text, size in d]
+ if len(set(l)) > 1:
+ flist = []
+ min_size = min(l)
+ for key, chars in groupby(d, itemgetter(1)):
+ if key == min_size:
+ fchars = [t[0] for t in chars]
+ if "".join(fchars).strip():
+ fchars.insert(0, "")
+ fchars.append("")
+ flist.append("".join(fchars))
+ else:
+ fchars = [t[0] for t in chars]
+ if "".join(fchars).strip():
+ flist.append("".join(fchars))
+ fstring = "".join(flist)
+ else:
+ fstring = "".join([t.get_text() for t in textline])
+ return text_strip(fstring, strip_text)
+
+
+def split_textline(table, textline, direction, flag_size=False, strip_text=""):
+ """Splits PDFMiner LTTextLine into substrings if it spans across
+ multiple rows/columns.
+
+ Parameters
+ ----------
+ table : camelot.core.Table
+ textline : object
+ PDFMiner LTTextLine object.
+ direction : string
+ Direction of the PDFMiner LTTextLine object.
+ flag_size : bool, optional (default: False)
+        Whether or not to highlight a substring using <s></s>
+        if its size is different from the rest of the string. (Useful for
+        super and subscripts.)
+ strip_text : str, optional (default: '')
+ Characters that should be stripped from a string before
+ assigning it to a cell.
+
+ Returns
+ -------
+ grouped_chars : list
+        List of tuples of the form (r_idx, c_idx, text) where r_idx and c_idx
+        are row and column indices and text is an LTTextLine substring.
+
+ """
+ idx = 0
+ cut_text = []
+ bbox = textline.bbox
+ try:
+ if direction == "horizontal" and not textline.is_empty():
+ x_overlap = [
+ i
+ for i, x in enumerate(table.cols)
+ if x[0] <= bbox[2] and bbox[0] <= x[1]
+ ]
+ r_idx = [
+ j
+ for j, r in enumerate(table.rows)
+ if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]
+ ]
+ r = r_idx[0]
+ x_cuts = [
+ (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
+ ]
+ if not x_cuts:
+ x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
+ for obj in textline._objs:
+ row = table.rows[r]
+ for cut in x_cuts:
+ if isinstance(obj, LTChar):
+ if (
+ row[1] <= (obj.y0 + obj.y1) / 2 <= row[0]
+ and (obj.x0 + obj.x1) / 2 <= cut[1]
+ ):
+ cut_text.append((r, cut[0], obj))
+ break
+ else:
+ # TODO: add test
+ if cut == x_cuts[-1]:
+ cut_text.append((r, cut[0] + 1, obj))
+ elif isinstance(obj, LTAnno):
+ cut_text.append((r, cut[0], obj))
+ elif direction == "vertical" and not textline.is_empty():
+ y_overlap = [
+ j
+ for j, y in enumerate(table.rows)
+ if y[1] <= bbox[3] and bbox[1] <= y[0]
+ ]
+ c_idx = [
+ i
+ for i, c in enumerate(table.cols)
+ if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]
+ ]
+ c = c_idx[0]
+ y_cuts = [
+ (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
+ ]
+ if not y_cuts:
+ y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
+ for obj in textline._objs:
+ col = table.cols[c]
+ for cut in y_cuts:
+ if isinstance(obj, LTChar):
+ if (
+ col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
+ and (obj.y0 + obj.y1) / 2 >= cut[1]
+ ):
+ cut_text.append((cut[0], c, obj))
+ break
+ else:
+ # TODO: add test
+ if cut == y_cuts[-1]:
+ cut_text.append((cut[0] - 1, c, obj))
+ elif isinstance(obj, LTAnno):
+ cut_text.append((cut[0], c, obj))
+ except IndexError:
+ return [(-1, -1, textline.get_text())]
+ grouped_chars = []
+ for key, chars in groupby(cut_text, itemgetter(0, 1)):
+ if flag_size:
+ grouped_chars.append(
+ (
+ key[0],
+ key[1],
+ flag_font_size(
+ [t[2] for t in chars], direction, strip_text=strip_text
+ ),
+ )
+ )
+ else:
+ gchars = [t[2].get_text() for t in chars]
+ grouped_chars.append(
+ (key[0], key[1], text_strip("".join(gchars), strip_text))
+ )
+ return grouped_chars
+
+
+def get_table_index(
+ table, t, direction, split_text=False, flag_size=False, strip_text=""
+):
+ """Gets indices of the table cell where given text object lies by
+ comparing their y and x-coordinates.
+
+ Parameters
+ ----------
+ table : camelot.core.Table
+ t : object
+ PDFMiner LTTextLine object.
+ direction : string
+ Direction of the PDFMiner LTTextLine object.
+ split_text : bool, optional (default: False)
+ Whether or not to split a text line if it spans across
+ multiple cells.
+ flag_size : bool, optional (default: False)
+        Whether or not to highlight a substring using <s></s>
+        if its size is different from the rest of the string. (Useful for
+        super and subscripts.)
+ strip_text : str, optional (default: '')
+ Characters that should be stripped from a string before
+ assigning it to a cell.
+
+ Returns
+ -------
+ indices : list
+ List of tuples of the form (r_idx, c_idx, text) where r_idx
+ and c_idx are row and column indices.
+ error : float
+ Assignment error, percentage of text area that lies outside
+ a cell.
+        +-------+
+        |       |
+        |  [Text bounding box]
+        |       |
+        +-------+
+
+ """
+ r_idx, c_idx = [-1] * 2
+ for r in range(len(table.rows)):
+ if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
+ r
+ ][1]:
+ lt_col_overlap = []
+ for c in table.cols:
+ if c[0] <= t.x1 and c[1] >= t.x0:
+ left = t.x0 if c[0] <= t.x0 else c[0]
+ right = t.x1 if c[1] >= t.x1 else c[1]
+ lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
+ else:
+ lt_col_overlap.append(-1)
+ if len(list(filter(lambda x: x != -1, lt_col_overlap))) == 0:
+ text = t.get_text().strip("\n")
+ text_range = (t.x0, t.x1)
+ col_range = (table.cols[0][0], table.cols[-1][1])
+ warnings.warn(
+ f"{text} {text_range} does not lie in column range {col_range}"
+ )
+ r_idx = r
+ c_idx = lt_col_overlap.index(max(lt_col_overlap))
+ break
+
+ # error calculation
+ y0_offset, y1_offset, x0_offset, x1_offset = [0] * 4
+ if t.y0 > table.rows[r_idx][0]:
+ y0_offset = abs(t.y0 - table.rows[r_idx][0])
+ if t.y1 < table.rows[r_idx][1]:
+ y1_offset = abs(t.y1 - table.rows[r_idx][1])
+ if t.x0 < table.cols[c_idx][0]:
+ x0_offset = abs(t.x0 - table.cols[c_idx][0])
+ if t.x1 > table.cols[c_idx][1]:
+ x1_offset = abs(t.x1 - table.cols[c_idx][1])
+ X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
+ Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
+ charea = X * Y
+ error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
+
+ if split_text:
+ return (
+ split_textline(
+ table, t, direction, flag_size=flag_size, strip_text=strip_text
+ ),
+ error,
+ )
+ else:
+ if flag_size:
+ return (
+ [
+ (
+ r_idx,
+ c_idx,
+ flag_font_size(t._objs, direction, strip_text=strip_text),
+ )
+ ],
+ error,
+ )
+ else:
+ return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
+
+
+def compute_accuracy(error_weights):
+ """Calculates a score based on weights assigned to various
+ parameters and their error percentages.
+
+ Parameters
+ ----------
+ error_weights : list
+ Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
+ where pn is the weight assigned to list of errors en.
+ Sum of pn should be equal to 100.
+
+ Returns
+ -------
+ score : float
+
+ """
+ SCORE_VAL = 100
+ try:
+ score = 0
+ if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
+ raise ValueError("Sum of weights should be equal to 100.")
+ for ew in error_weights:
+ weight = ew[0] / len(ew[1])
+ for error_percentage in ew[1]:
+ score += weight * (1 - error_percentage)
+ except ZeroDivisionError:
+ score = 0
+ return score
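+
+# Editor's sketch, not part of the original module: with weights 50/50 and
+# error lists [0.1, 0.3] and [0.2], the score is 25*0.9 + 25*0.7 + 50*0.8:
+#
+#     >>> compute_accuracy([[50, [0.1, 0.3]], [50, [0.2]]])
+#     80.0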
+
+
+def compute_whitespace(d):
+ """Calculates the percentage of empty strings in a
+ two-dimensional list.
+
+ Parameters
+ ----------
+ d : list
+
+ Returns
+ -------
+ whitespace : float
+ Percentage of empty cells.
+
+ """
+ whitespace = 0
+ r_nempty_cells, c_nempty_cells = [], []
+ for i in d:
+ for j in i:
+ if j.strip() == "":
+ whitespace += 1
+ whitespace = 100 * (whitespace / float(len(d) * len(d[0])))
+ return whitespace
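+
+# Editor's sketch, not part of the original module: two empty cells out of four
+# give 50 percent whitespace:
+#
+#     >>> compute_whitespace([["a", ""], [" ", "b"]])
+#     50.0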
+
+
+def get_page_layout(
+ filename,
+ line_overlap=0.5,
+ char_margin=1.0,
+ line_margin=0.5,
+ word_margin=0.1,
+ boxes_flow=0.5,
+ detect_vertical=True,
+ all_texts=True,
+):
+ """Returns a PDFMiner LTPage object and page dimension of a single
+ page pdf. To get the definitions of kwargs, see
+ https://pdfminersix.rtfd.io/en/latest/reference/composable.html.
+
+ Parameters
+ ----------
+ filename : string
+ Path to pdf file.
+ line_overlap : float
+ char_margin : float
+ line_margin : float
+ word_margin : float
+ boxes_flow : float
+ detect_vertical : bool
+ all_texts : bool
+
+ Returns
+ -------
+ layout : object
+ PDFMiner LTPage object.
+ dim : tuple
+ Dimension of pdf page in the form (width, height).
+
+ """
+ with open(filename, "rb") as f:
+ parser = PDFParser(f)
+ document = PDFDocument(parser)
+ if not document.is_extractable:
+ raise PDFTextExtractionNotAllowed(
+ f"Text extraction is not allowed: {filename}"
+ )
+ laparams = LAParams(
+ line_overlap=line_overlap,
+ char_margin=char_margin,
+ line_margin=line_margin,
+ word_margin=word_margin,
+ boxes_flow=boxes_flow,
+ detect_vertical=detect_vertical,
+ all_texts=all_texts,
+ )
+ rsrcmgr = PDFResourceManager()
+ device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+ interpreter = PDFPageInterpreter(rsrcmgr, device)
+ for page in PDFPage.create_pages(document):
+ interpreter.process_page(page)
+ layout = device.get_result()
+ width = layout.bbox[2]
+ height = layout.bbox[3]
+ dim = (width, height)
+ return layout, dim
+
+
+def get_text_objects(layout, ltype="char", t=None):
+ """Recursively parses pdf layout to get a list of
+ PDFMiner text objects.
+
+ Parameters
+ ----------
+ layout : object
+ PDFMiner LTPage object.
+    ltype : string
+        Specify 'char', 'image', 'horizontal_text' or 'vertical_text' to get
+        LTChar, LTImage, LTTextLineHorizontal, and LTTextLineVertical objects
+        respectively.
+    t : list, optional (default: None)
+        List to which extracted text objects are appended.
+
+ Returns
+ -------
+ t : list
+ List of PDFMiner text objects.
+
+ """
+ if ltype == "char":
+ LTObject = LTChar
+ elif ltype == "image":
+ LTObject = LTImage
+ elif ltype == "horizontal_text":
+ LTObject = LTTextLineHorizontal
+ elif ltype == "vertical_text":
+ LTObject = LTTextLineVertical
+ if t is None:
+ t = []
+ try:
+ for obj in layout._objs:
+ if isinstance(obj, LTObject):
+ t.append(obj)
+ else:
+ t += get_text_objects(obj, ltype=ltype)
+ except AttributeError:
+ pass
+ return t
diff --git a/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/TestsConfiguration.kt b/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/TestsConfiguration.kt
index 3638eb72..4e832cc9 100644
--- a/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/TestsConfiguration.kt
+++ b/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/TestsConfiguration.kt
@@ -2,6 +2,6 @@ package com.github.darderion.mundaneassignmentpolice
class TestsConfiguration {
companion object {
- const val resourceFolder = "src/test/resources/com/github/darderion/mundaneassignmentpolice/"
+ const val resourceFolder = "src/main/python/src/test/resources/com/github/darderion/mundaneassignmentpolice/"
}
}
\ No newline at end of file
diff --git a/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocumentTests.kt b/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocumentTests.kt
index ad8c15bf..00d46e50 100644
--- a/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocumentTests.kt
+++ b/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocumentTests.kt
@@ -8,7 +8,7 @@ import com.github.darderion.mundaneassignmentpolice.wrapper.PDFBox
import io.kotest.core.spec.style.StringSpec
import io.kotest.inspectors.forAll
import io.kotest.matchers.shouldBe
-
+/*
class PDFDocumentTests: StringSpec({
"PDFDocument should contain TITLE_PAGE's lines" {
PDFDocument(text = lines).text.any { it.area == TITLE_PAGE } shouldBe true
@@ -152,3 +152,5 @@ class PDFDocumentTests: StringSpec({
)
}
}
+
+ */
diff --git a/src/test/python/TableExtractionScriptTest.py b/src/test/python/TableExtractionScriptTest.py
new file mode 100644
index 00000000..83ffccfa
--- /dev/null
+++ b/src/test/python/TableExtractionScriptTest.py
@@ -0,0 +1,60 @@
+import unittest
+import pandas
+import contextlib
+from pathlib import Path
+import io
+import os
+import sys
+import src.main.python.camelot
+from src.main.python.TableExtractionScript import extraction
+
+sys.path.insert(0, '../src')
+
+class TableExtractionScriptTest(unittest.TestCase):
+
+ def test_open_file(self):
+        pdf_path = 'src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/OpenNotPDF.docx'
+
+ s = io.StringIO()
+ with contextlib.redirect_stdout(s):
+ extraction(pdf_path)
+
+ self.assertEqual('invalid PDF file\n', s.getvalue())
+
+ def test_check_table_directory(self):
+ pdf_path = 'src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf'
+ extraction(pdf_path)
+ self.assertTrue(os.path.exists(f'uploads/tables/{Path(pdf_path).stem}'))
+
+ def test_save_table(self):
+ pdf_path = 'src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf'
+ extraction(pdf_path)
+ self.assertTrue(os.path.exists('uploads/tables/TableInformation/TableInformation-page-1-table-1.csv'))
+
+ def test_check_table_information(self):
+ pdf_path = 'src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf'
+ extraction(pdf_path)
+ table = pandas.read_csv(os.path.expanduser("~/map/uploads/tables/TableInformation/TableInformation-page-1-table-1.csv"))
+ camelot_table = src.main.python.camelot.read_pdf(pdf_path, linescale=30)[0]
+ self.assertEqual('table data', table.columns[0])
+
+ self.assertEqual('table information', table['table data'][4])
+
+ self.assertEqual('page', table['table data'][5])
+ self.assertEqual('1', table['table data'][6])
+
+ self.assertEqual('table area', table['table data'][7])
+ self.assertEqual(camelot_table.cells[3][0].x1, float(table['table data'][8]))
+ self.assertEqual(camelot_table.cells[3][3].x2, float(table['table data'][10]))
+ self.assertEqual(camelot_table.cells[3][0].y1, float(table['table data'][9]))
+ self.assertEqual(camelot_table.cells[0][3].y2, float(table['table data'][11]))
+
+ self.assertEqual('rows', table['table data'][12])
+ self.assertEqual('4', table['table data'][13])
+
+ self.assertEqual('columns', table['table data'][14])
+ self.assertEqual('4', table['table data'][15])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/test/python/camelot/camelot_py.py b/src/test/python/camelot/camelot_py.py
new file mode 100644
index 00000000..1424c35f
--- /dev/null
+++ b/src/test/python/camelot/camelot_py.py
@@ -0,0 +1,109 @@
+import os
+import unittest
+import sys
+sys.path.insert(0, '../src')
+import src.main.python.camelot as camelot
+from src.main.python.camelot.image_processing import (
+ intersectes
+)
+os.chdir(os.path.expanduser("~/map/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot"))
+
+
+class DrawingLines(unittest.TestCase):
+ def test_v_draw(self):
+ file_name = 'DrawingVerticalLines.pdf'
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='1')
+ self.assertEqual(0, len(tables))
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='2')
+ self.assertEqual(0, len(tables))
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='3')
+ self.assertEqual(1, len(tables))
+ self.assertEqual(5, len(tables[0].cells))
+ self.assertEqual(1, len(tables[0].cols))
+ self.assertEqual(5, len(tables[0].rows))
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='4')
+ self.assertEqual(3, len(tables))
+
+ self.assertEqual(2, len(tables[0].cells))
+ self.assertEqual(2, len(tables[1].cells))
+ self.assertEqual(2, len(tables[2].cells))
+
+ self.assertEqual(1, len(tables[0].cols))
+ self.assertEqual(1, len(tables[1].cols))
+ self.assertEqual(1, len(tables[2].cols))
+
+ self.assertEqual(2, len(tables[0].rows))
+ self.assertEqual(2, len(tables[1].rows))
+ self.assertEqual(2, len(tables[2].rows))
+
+ def test_h_draw(self):
+ file_name = 'DrawingHorizontalLines.pdf'
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='1')
+ self.assertEqual(0, len(tables))
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='2')
+ self.assertEqual(0, len(tables))
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='3')
+ self.assertEqual(1, len(tables))
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='4')
+ self.assertEqual(1, len(tables))
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='5')
+ self.assertEqual(2, len(tables))
+
+ def test_intersects(self):
+ # rib1 intersects rib2 at first end
+ rib1, rib2 = (1, 100, 1, 5), (1, 5, 100, 5)
+ self.assertEqual(True, intersectes(rib1, rib2))
+
+ # rib1 intersects rib2 at second end
+ rib1, rib2 = (1, 100, 100, 100), (100, 100, 100, 5)
+ self.assertEqual(True, intersectes(rib1, rib2))
+
+ # horizontal rib1 parallel to horizontal rib2
+ rib1, rib2 = (1, 100, 5, 100), (1, 200, 5, 200)
+ self.assertEqual(False, intersectes(rib1, rib2))
+
+ # vertical rib1 parallel to vertical rib2
+ rib1, rib2 = (1, 100, 1, 200), (10, 100, 10, 200)
+ self.assertEqual(False, intersectes(rib1, rib2))
+
+ # rib1 intersects rib2 inside
+ rib1, rib2 = (1, 5, 100, 5), (50, 100, 50, 2)
+ self.assertEqual(True, intersectes(rib1, rib2))
+
+ # rib1 does not intersect rib2
+ rib1, rib2 = (5, 10, 100, 10), (50, 60, 50, 40)
+ self.assertEqual(False, intersectes(rib1, rib2))
+
+ # rib1 lies on the same line as rib2 and does not intersect rib2
+ rib1, rib2 = (5, 10, 100, 10), (150, 10, 160, 10)
+ self.assertEqual(False, intersectes(rib1, rib2))
+
+ def test_correct_lines(self):
+ file_name = 'DrawingComplexTables.pdf'
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='1')
+ self.assertEqual(1, len(tables))
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='2')
+ self.assertEqual(2, len(tables))
+
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='3')
+ self.assertEqual(2, len(tables))
+
+ tables = camelot.read_pdf(file_name, latice=True, pages='4')
+ self.assertEqual(3, len(tables))
+
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingComplexTables.pdf b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingComplexTables.pdf
new file mode 100644
index 00000000..9fce1ba4
Binary files /dev/null and b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingComplexTables.pdf differ
diff --git a/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingHorizontalLines.pdf b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingHorizontalLines.pdf
new file mode 100644
index 00000000..2de1c1a2
Binary files /dev/null and b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingHorizontalLines.pdf differ
diff --git a/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingVerticalLines.pdf b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingVerticalLines.pdf
new file mode 100644
index 00000000..c8526582
Binary files /dev/null and b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingVerticalLines.pdf differ
diff --git a/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/OpenNotPDF.docx b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/OpenNotPDF.docx
new file mode 100644
index 00000000..f839a32b
Binary files /dev/null and b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/OpenNotPDF.docx differ
diff --git a/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf
new file mode 100644
index 00000000..d19b3964
Binary files /dev/null and b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf differ