diff --git a/package-lock.json b/package-lock.json index 8a23ca84..40eb5db9 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,5 +1,5 @@ { - "name": "mundane-assignment-police", + "name": "map", "lockfileVersion": 2, "requires": true, "packages": {} diff --git a/pom.xml b/pom.xml index c6177209..05953c1a 100644 --- a/pom.xml +++ b/pom.xml @@ -15,7 +15,7 @@ Web-app that assists in checking students' assignments 11 - 1.5.31 + 1.6.21 @@ -85,6 +85,11 @@ 5.0.0.M1 test + + org.jetbrains.kotlinx + dataframe + 0.8.0-dev-1005 + diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/RuleViolation.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/RuleViolation.kt index 1782c96e..fd37b0e1 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/RuleViolation.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/RuleViolation.kt @@ -13,3 +13,4 @@ data class RuleViolation( ) { // override fun toString() = if (lines.count() == 1) "[${lines.first().line}, p.${lines.first().page}] --> '$message'" else "" } + diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt index e8e2a712..ec2f7b2b 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt @@ -8,6 +8,7 @@ import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFArea.TABLE_OF import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion import com.github.darderion.mundaneassignmentpolice.pdfdocument.list.PDFList +import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Coordinate import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line class ListRule( @@ -26,7 +27,7 @@ class ListRule( document.areas!!.tableOfContents.map { document.text.filter { it.area == TABLE_OF_CONTENT }.firstOrNull { line -> line.content.contains(it) - }?: Line(0, 0, 0, listOf(), TABLE_OF_CONTENT) + }?: Line(0, 0, 0, listOf(), TABLE_OF_CONTENT, Coordinate(0,0)) } ) diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/symbol/BasicSymbolRule.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/symbol/BasicSymbolRule.kt index 41b5405c..c1414fe3 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/symbol/BasicSymbolRule.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/symbol/BasicSymbolRule.kt @@ -39,6 +39,7 @@ class BasicSymbolRule( when (direction) { LEFT -> sideTexts.removeAt(1) RIGHT -> sideTexts.removeAt(0) + else -> {} } val neighbors = (if (notIgnoredNeighbors.isNotEmpty()) sideTexts diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRule.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRule.kt new file mode 100644 index 00000000..697401e4 --- /dev/null +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRule.kt @@ -0,0 +1,32 @@ +package com.github.darderion.mundaneassignmentpolice.checker.rule.table + +import com.github.darderion.mundaneassignmentpolice.checker.RuleViolation +import 
com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType +import com.github.darderion.mundaneassignmentpolice.checker.rule.Rule +import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument +import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion +import com.github.darderion.mundaneassignmentpolice.pdfdocument.tables.Table +import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line + +class TableRule ( + val predicates: MutableList<(Table) -> List<Line>>, + type: RuleViolationType, + area: PDFRegion, + name: String + ): Rule(area, name, type){ + override fun process(document: PDFDocument): List<RuleViolation> { + val rulesViolations: MutableSet<RuleViolation> = mutableSetOf() + + predicates.forEach { predicate -> + rulesViolations.addAll( + document.tables.map { + predicate(it) + }.filter { it.isNotEmpty() }.map { + RuleViolation(it, name, type) + } + ) + } + + return rulesViolations.toList() + } + } diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRuleBuilder.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRuleBuilder.kt new file mode 100644 index 00000000..c9acd21f --- /dev/null +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/table/TableRuleBuilder.kt @@ -0,0 +1,18 @@ +package com.github.darderion.mundaneassignmentpolice.checker.rule.table + +import com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType +import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion +import com.github.darderion.mundaneassignmentpolice.pdfdocument.tables.Table +import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line + +class TableRuleBuilder { + private val predicates: MutableList<(Table) -> List<Line>> = mutableListOf() + private var type: RuleViolationType = RuleViolationType.Error + private var region: PDFRegion = PDFRegion.EVERYWHERE + private var name: String = "Rule name" + + fun called(name: String) = this.also { this.name = name } + + fun disallow(predicate: (table: Table) -> List<Line>) = this.also { predicates.add(predicate) } + fun getRule() = TableRule(predicates, type, region, name) +} \ No newline at end of file diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/word/BasicWordRule.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/word/BasicWordRule.kt index e5310909..e0f0ddd0 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/word/BasicWordRule.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/word/BasicWordRule.kt @@ -39,6 +39,7 @@ class BasicWordRule( when (direction) { Direction.LEFT -> sideWords.removeAt(1) Direction.RIGHT -> sideWords.removeAt(0) + else -> {} } val filteredSideWords = sideWords diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/Annotations.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/Annotations.kt index 721b9d2e..edc86d07 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/Annotations.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/Annotations.kt @@ -13,10 +13,12 @@ class Annotations { var document = PDFBox().getDocument(pdf.name) lines.forEach { line -> document = PDFBox().addLine(document, line.page, - Coordinate(line.position.x to (pdf.height - (line.text.maxOf { it.position.y } + 2))), - (pdf.width - (line.position.x + 
50)).toInt() + Coordinate(line.startPosition.x to (pdf.height - (line.text.maxOf { it.position.y } + 2))), + (line.endPosition.x - line.startPosition.x).toInt() ) } + + Files.createDirectories(Paths.get("${pdfFolder}ruleviolations/")) val fileName = "${pdfFolder}ruleviolations/${ pdf.name.split('/')[pdf.name.split('/').count() - 1].replace(".pdf", "") @@ -24,5 +26,6 @@ class Annotations { document.save(fileName) return fileName } + } } diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocument.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocument.kt index a7c7e71b..e499ddd5 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocument.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocument.kt @@ -1,10 +1,12 @@ package com.github.darderion.mundaneassignmentpolice.pdfdocument +import com.github.darderion.mundaneassignmentpolice.pdfdocument.tables.Table import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line import mu.KotlinLogging class PDFDocument(val name: String = "PDF", val text: List<Line>, + val tables: List<Table>, val width: Double = defaultPageWidth, val height: Double = defaultPageHeight ) { diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/list/PDFList.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/list/PDFList.kt index c55826eb..3ce27e68 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/list/PDFList.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/list/PDFList.kt @@ -54,7 +54,7 @@ data class PDFList(val value: MutableList<Line> = mutableListOf(), val nodes: Mu */ fun getLists(lines: List<Line>): List<PDFList> { // Adding a line to process a text that has no lines after a list - val lines = lines + Line(-1, -1, -1, listOf(Word("NOT A LIST ITEM", Font(0.0f), Coordinate(1000, -1)))) + val lines = lines + Line(-1, -1, -1, listOf(Word("NOT A LIST ITEM", Font(0.0f), Coordinate(1000, -1))), null, Coordinate(0,0)) val lists: MutableList<PDFList> = mutableListOf() val stack: Stack<PDFList> = Stack() @@ -69,11 +69,11 @@ stack.push(stack.peek().nodes.first()) } } else { - previousPosition = stack.peek().value.first().position - if (previousPosition hasSameXAs line.position) { // 1. lorem OR lorem + previousPosition = stack.peek().value.first().startPosition + if (previousPosition hasSameXAs line.startPosition) { // 1. lorem OR lorem stack.peek().value.add(line) // lorem lorem } else { - if (previousPosition.x < line.position.x) { + if (previousPosition.x < line.startPosition.x) { if (isListItem(line)) { stack.peek().nodes.add(PDFList(line.drop(2))) // lorem stack.push(stack.peek().nodes.last()) // 1. lorem } } else { // lorem OR lorem OR ... lorem OR ... lorem while (!( stack.isEmpty() || // lorem 2. lorem lorem 2. 
lorem - (isListItem(line) && previousPosition hasSameXAs line.drop(2).position) || - previousPosition hasSameXAs line.position)) { + (isListItem(line) && previousPosition hasSameXAs line.drop(2).startPosition) || + previousPosition hasSameXAs line.startPosition)) { previousList = stack.pop() if (stack.isNotEmpty()) { - previousPosition = stack.peek().value.first().position + previousPosition = stack.peek().value.first().startPosition } } if (stack.isEmpty()) { lists.add(previousList!!) } else { - if (previousPosition hasSameXAs line.position) { // lorem + if (previousPosition hasSameXAs line.startPosition) { // lorem stack.peek().value.add(line) // lorem } else { stack.pop() diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Cell.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Cell.kt new file mode 100644 index 00000000..880cd0ac --- /dev/null +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Cell.kt @@ -0,0 +1,11 @@ +package com.github.darderion.mundaneassignmentpolice.pdfdocument.tables +import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Coordinate +import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line + +data class Cell( + val page: Int, + val cellText: MutableList<String>, + var cellLines: MutableList<Line>, + val leftCorner: Coordinate, + val rightCorner: Coordinate +) \ No newline at end of file diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Table.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Table.kt new file mode 100644 index 00000000..f41c97cc --- /dev/null +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/tables/Table.kt @@ -0,0 +1,69 @@ +package com.github.darderion.mundaneassignmentpolice.pdfdocument.tables + +import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.* +import com.github.darderion.mundaneassignmentpolice.wrapper.PDFBox +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.* + +class Table(val df: DataFrame<*>){ + + val page : Int + val x1 : Double + val y1 : Double + val x2 : Double + val y2 : Double + val rowCount : Int + val colCount : Int + val cells: MutableList<Cell> = mutableListOf() + init { + val indexTableInf = df.select{ cols(0) }.last { it[0] == "table information"}.index() + val tableInf = df.select{cols(0)}.filter { it.index() >= indexTableInf } + + this.page = tableInf[pageTableIndex][0].toString().toInt() - 1 + this.x1 = tableInf[x1TableIndex][0].toString().toDouble() + this.y1 = defaultPageHeight - tableInf[y1TableIndex][0].toString().toDouble() + this.x2 = tableInf[x2TableIndex][0].toString().toDouble() + this.y2 = defaultPageHeight - tableInf[y2TableIndex][0].toString().toDouble() + this.rowCount = tableInf[rowTableIndex][0].toString().toInt() + this.colCount = tableInf[colTableIndex][0].toString().toInt() + val tableData = df.filter { it.index() < indexTableInf } + + tableData.forEachColumn { it.forEach { getCell(it.toString()) } } + } + + private fun getCell(text: String){ + + val coordinates = text.lines().first().split(" ") + val x1 = coordinates[x1CellIndex].toDouble() + val y1 = defaultPageHeight - coordinates[y1CellIndex].toDouble() + val x2 = coordinates[x2CellIndex].toDouble() + val y2 = defaultPageHeight - coordinates[y2CellIndex].toDouble() + + val cellText = text.lines().filterIndexed{ index, _ 
-> index > 0 }.toMutableList() + + cells.add(Cell(page, cellText, mutableListOf(), Coordinate(x1,y1), Coordinate(x2,y2))) + } + + fun getLines(): List<Line>{ + val lines = mutableListOf<Line>() + cells.forEach{ lines.addAll(it.cellLines) } + return lines + } + + companion object { + private const val defaultPageHeight = 842.0 + private const val x1CellIndex = 2 + private const val y1CellIndex = 5 + private const val x2CellIndex = 8 + private const val y2CellIndex = 11 + + private const val pageTableIndex = 2 + private const val x1TableIndex = 4 + private const val y1TableIndex = 5 + private const val x2TableIndex = 6 + private const val y2TableIndex = 7 + private const val rowTableIndex = 9 + private const val colTableIndex = 11 + } +} diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/text/Line.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/text/Line.kt index a7003cc6..021e112a 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/text/Line.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/text/Line.kt @@ -3,12 +3,12 @@ package com.github.darderion.mundaneassignmentpolice.pdfdocument.text import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFArea data class Line(val index: Int, val page: Int, val documentIndex: Int, - val text: List<Word>, var area: PDFArea? = null + val text: List<Word>, var area: PDFArea? = null, var endPosition: Coordinate ) { val content: String get() = text.joinToString("") { it.text } - val position: Coordinate + val startPosition: Coordinate get() = if (text.isNotEmpty()) text.first().position else Coordinate(0, 0) val first: String? @@ -17,7 +17,10 @@ data class Line(val index: Int, val page: Int, val documentIndex: Int, val second: String? 
get() = if (text.count() > 1) text[1].text else null - override fun toString() = "[$documentIndex -- $index, p.$page, $area, ${position.x}] --> '$content'" + override fun toString() = "[$documentIndex -- $index, p.$page, $area, ${startPosition.x}] --> '$content'" - fun drop(numberOfItems: Int) = Line(index, page, documentIndex, text.drop(numberOfItems), area) + fun drop(numberOfItems: Int) = Line(index, page, documentIndex, text.drop(numberOfItems), area, Coordinate(0,0)) + companion object{ + private const val defaultPageWidth = 595.22 + } } diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt index ec5708da..5195f93b 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt @@ -4,7 +4,9 @@ import com.github.darderion.mundaneassignmentpolice.checker.rule.Rule val RULE_SET_RU = RuleSet( mutableListOf( - RULE_LITLINK, + TABLE_RULE, + + /*RULE_LITLINK, RULE_SHORT_DASH, RULE_MEDIUM_DASH, RULE_LONG_DASH, @@ -22,9 +24,12 @@ val RULE_SET_RU = RuleSet( RULE_VARIOUS_ABBREVIATIONS, RULE_SECTIONS_ORDER, RULE_LOW_QUALITY_CONFERENCES, + + */ ) - + RULES_SPACE_AROUND_BRACKETS + /*+ RULES_SPACE_AROUND_BRACKETS + RULES_SMALL_NUMBERS -) + */ +) class RuleSet(val rules: List) {} diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt index 346e4a12..568e2cb9 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt @@ -7,6 +7,7 @@ import com.github.darderion.mundaneassignmentpolice.checker.rule.regex.RegexRule import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.SymbolRuleBuilder import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.and import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.or +import com.github.darderion.mundaneassignmentpolice.checker.rule.table.TableRuleBuilder import com.github.darderion.mundaneassignmentpolice.checker.rule.tableofcontent.TableOfContentRuleBuilder import com.github.darderion.mundaneassignmentpolice.checker.rule.url.URLRuleBuilder import com.github.darderion.mundaneassignmentpolice.checker.rule.url.then @@ -15,6 +16,7 @@ import com.github.darderion.mundaneassignmentpolice.checker.rule.word.WordRuleBu import com.github.darderion.mundaneassignmentpolice.checker.rule.word.or import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFArea import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion +import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line import com.github.darderion.mundaneassignmentpolice.utils.InvalidOperationException import com.github.darderion.mundaneassignmentpolice.utils.LowQualityConferencesUtil import com.github.darderion.mundaneassignmentpolice.utils.ResourcesUtil @@ -416,3 +418,12 @@ val RULE_LOW_QUALITY_CONFERENCES = URLRuleBuilder() .any { conference -> url.text.contains(conference) } }.map { it to it.lines } }.getRule() + +val TABLE_RULE = TableRuleBuilder() + .called("Все клетки") + .disallow { table -> + val lines = mutableListOf() + table.cells.forEach { cell -> lines.addAll(cell.cellLines) } + lines + } + .getRule() \ No newline at end of file diff --git 
a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/wrapper/PDFBox.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/wrapper/PDFBox.kt index f4ca5596..3ed05144 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/wrapper/PDFBox.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/wrapper/PDFBox.kt @@ -1,6 +1,7 @@ package com.github.darderion.mundaneassignmentpolice.wrapper import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument +import com.github.darderion.mundaneassignmentpolice.pdfdocument.tables.Table import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.* import com.github.darderion.mundaneassignmentpolice.utils.imgToBase64String import org.apache.pdfbox.pdmodel.PDDocument @@ -11,9 +12,19 @@ import org.apache.pdfbox.pdmodel.font.PDType1Font import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject import org.apache.pdfbox.text.PDFTextStripper +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.io.read import java.awt.Color import java.awt.image.RenderedImage import java.io.* +import java.nio.file.Files +import java.nio.file.LinkOption +import java.util.* +import java.util.concurrent.TimeUnit +import kotlin.collections.ArrayList +import kotlin.collections.HashMap +import kotlin.collections.LinkedHashSet +import kotlin.io.path.Path class PDFBox { @@ -90,6 +101,8 @@ class PDFBox { * @return PDFDocument */ fun getPDF(fileName: String): PDFDocument { + val tables = getTables(fileName) + val pdfText: MutableList = mutableListOf() val document = getDocument(fileName) @@ -117,7 +130,7 @@ class PDFBox { var font: Font? var word: String var symb: Symbol - val words: MutableList = mutableListOf() + var words: MutableList = mutableListOf() var contentIndex: Int var contentItem: String var coordinates = Coordinate(0, 0) @@ -166,13 +179,51 @@ class PDFBox { if (font == null && word.isEmpty()) font = Font(0.0f) words.add(Word(word, font!!, coordinates)) - Line(line, pageIndex, lineIndex, words.toList()) - }) - } + tables.filter { table -> table.page == pageIndex }.forEach { table -> + words = words.filter { word -> !isWordInTable(pageIndex, word, table) } + .filter { it.text.isNotEmpty() }.toMutableList() + } + + if (document.pages[pageIndex].resources.xObjectNames.count() != 0){ + Line(line, pageIndex, lineIndex, words.toList(),null,Coordinate(0,0)) + } + else{ + Line(line, pageIndex, lineIndex, words.toList(),null,stripper.symbols[stripperIndex-1].position)} + } + ) + var line = text.lines().size + tables.forEach { table -> + if (table.page == pageIndex) + table.cells.forEach { cell -> + val cellLines = mutableListOf() + cellLines.addAll(cell.cellText.filter { it.isNotEmpty() }.map { content -> + words.clear() + content.split(" ").forEach { + words.add(Word(it, Font(12f), cell.leftCorner)) + } + lineIndex += 1 + line += 1 + val tableLine = Line(line, pageIndex, lineIndex, words.toList(), + endPosition = Coordinate(cell.rightCorner.x, cell.rightCorner.y)) + cell.cellLines = cellLines + pdfText.add(tableLine) + tableLine + } + + ) + } + } + } document.close() - return PDFDocument(fileName, pdfText, size.width.toDouble(), size.height.toDouble()) + return PDFDocument(fileName, pdfText, tables, size.width.toDouble(), size.height.toDouble()) + } + + private fun isWordInTable(page: Int, word: Word, table: Table): Boolean { + return page == table.page && + word.position.x >= table.x1 && word.position.y <= 
table.y1 && + word.position.x <= table.x2 && word.position.y >= table.y2 } fun getPDFSize(fileName: String): Int { @@ -209,4 +260,37 @@ } return images } + + /** + * Returns tables from PDF + * @param path pdf's path + * @return list of Table + */ + fun getTables(path: String): List<Table>
{ + + val workingDirPath = System.getProperty("user.home") + "/map" + val fileName = path.replace("uploads/","") + val tables = mutableListOf<Table>
() + + if (!Files.exists(Path("$workingDirPath/uploads/tables/$fileName"), LinkOption.NOFOLLOW_LINKS)) { + + ProcessBuilder( + "src/main/python/venv/bin/python3", + "src/main/python/TableExtractionScript.py", + "extraction", path + ) + .directory(File(workingDirPath)) + .redirectOutput(ProcessBuilder.Redirect.INHERIT) + .start() + .waitFor() + } + + File("$workingDirPath/uploads/tables/$fileName/").walkBottomUp().filter { it.isFile }.forEach { + val df = DataFrame.read(it) + tables.add(Table(df)) + } + + return tables + } + } diff --git a/src/main/python/TableExtractionScript.py b/src/main/python/TableExtractionScript.py new file mode 100755 index 00000000..a42773de --- /dev/null +++ b/src/main/python/TableExtractionScript.py @@ -0,0 +1,53 @@ +import PyPDF2 +from PyPDF2.errors import PdfReadError +import src.main.python.camelot +import pandas +import os +import sys +from pathlib import Path +sys.path.insert(0, '../src') + + +def extraction(pdf_path): + + os.chdir(os.path.expanduser("~/map/")) + file_name = Path(pdf_path).stem + + try: + PyPDF2.PdfFileReader(open(pdf_path, 'rb')) + except PyPDF2.errors.PdfReadError: + print("invalid PDF file") + else: + if not os.path.isdir(f'uploads/tables/{file_name}'): + os.mkdir(f'uploads/tables/{file_name}') + + tables = src.main.python.camelot.read_pdf(pdf_path, latice=True, pages='all', line_scale=30) + + for k in range(len(tables)): + left_x, left_y, right_x, right_y = 596, 896, 0, 0 + for i in range(len(tables[k].cells)): + for j in range(len(tables[k].cells[i])): + left_x = min(left_x, tables[k].cells[i][j].x1) + left_y = min(left_y, tables[k].cells[i][j].y1) + right_x = max(right_x, tables[k].cells[i][j].x2) + right_y = max(right_y, tables[k].cells[i][j].y2) + tables[k].df.at[i, j] = f'x1 = {tables[k].cells[i][j].x1} ' \ + f'y1 = {tables[k].cells[i][j].y1} ' \ + f'x2 = {tables[k].cells[i][j].x2} ' \ + f'y2 = {tables[k].cells[i][j].y2} \n ' \ + + tables[k].df.at[i, j] + tables[k].df = pandas.concat([pandas.DataFrame(['table data']), tables[k].df, + pandas.DataFrame(['table information', + 'page', tables[k].page, + 'table area', left_x, left_y, right_x, right_y, + 'rows', len(tables[k].rows), + 'columns', len(tables[k].cols)], + )], + ignore_index=True) + tables.export(f'uploads/tables/{file_name}/{file_name}.csv', + f='csv', + compress=False) + + +if __name__ == '__main__': + globals()[sys.argv[1]](sys.argv[2]) diff --git a/src/main/python/camelot/__init__.py b/src/main/python/camelot/__init__.py new file mode 100755 index 00000000..bc4beb62 --- /dev/null +++ b/src/main/python/camelot/__init__.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +import logging + +from .__version__ import __version__ +from .io import read_pdf +from .plotting import PlotMethods + + +# set up logging +logger = logging.getLogger("camelot") + +format_string = "%(asctime)s - %(levelname)s - %(message)s" +formatter = logging.Formatter(format_string, datefmt="%Y-%m-%dT%H:%M:%S") +handler = logging.StreamHandler() +handler.setFormatter(formatter) + +logger.addHandler(handler) + +# instantiate plot method +plot = PlotMethods() diff --git a/src/main/python/camelot/__main__.py b/src/main/python/camelot/__main__.py new file mode 100644 index 00000000..ac90c95f --- /dev/null +++ b/src/main/python/camelot/__main__.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- + + +__all__ = ("main",) + + +def main(): + from src.main.python.camelot.cli import cli + + cli() + + +if __name__ == "__main__": + main() diff --git a/src/main/python/camelot/__version__.py 
b/src/main/python/camelot/__version__.py new file mode 100644 index 00000000..72364b92 --- /dev/null +++ b/src/main/python/camelot/__version__.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +VERSION = (0, 11, 0) +PRERELEASE = None # alpha, beta or rc +REVISION = None + + +def generate_version(version, prerelease=None, revision=None): + version_parts = [".".join(map(str, version))] + if prerelease is not None: + version_parts.append(f"-{prerelease}") + if revision is not None: + version_parts.append(f".{revision}") + return "".join(version_parts) + + +__title__ = "camelot-py" +__description__ = "PDF Table Extraction for Humans." +__url__ = "http://camelot-py.readthedocs.io/" +__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION) +__author__ = "Vinayak Mehta" +__author_email__ = "vmehta94@gmail.com" +__license__ = "MIT License" diff --git a/src/main/python/camelot/backends/__init__.py b/src/main/python/camelot/backends/__init__.py new file mode 100644 index 00000000..8d0b91e9 --- /dev/null +++ b/src/main/python/camelot/backends/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +from .image_conversion import ImageConversionBackend diff --git a/src/main/python/camelot/backends/ghostscript_backend.py b/src/main/python/camelot/backends/ghostscript_backend.py new file mode 100644 index 00000000..1de7da19 --- /dev/null +++ b/src/main/python/camelot/backends/ghostscript_backend.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- + +import sys +import ctypes +from ctypes.util import find_library + + +def installed_posix(): + library = find_library("gs") + return library is not None + + +def installed_windows(): + library = find_library( + "".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll")) + ) + return library is not None + + +class GhostscriptBackend(object): + def installed(self): + if sys.platform in ["linux", "darwin"]: + return installed_posix() + elif sys.platform == "win32": + return installed_windows() + else: + return installed_posix() + + def convert(self, pdf_path, png_path, resolution=300): + if not self.installed(): + raise OSError( + "Ghostscript is not installed. 
You can install it using the instructions" + " here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html" + ) + + import ghostscript + + gs_command = [ + "gs", + "-q", + "-sDEVICE=png16m", + "-o", + png_path, + f"-r{resolution}", + pdf_path, + ] + ghostscript.Ghostscript(*gs_command) diff --git a/src/main/python/camelot/backends/image_conversion.py b/src/main/python/camelot/backends/image_conversion.py new file mode 100644 index 00000000..7d2c4d7a --- /dev/null +++ b/src/main/python/camelot/backends/image_conversion.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- + +from .poppler_backend import PopplerBackend +from .ghostscript_backend import GhostscriptBackend + +BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend} + + +class ImageConversionBackend(object): + def __init__(self, backend="poppler", use_fallback=True): + if backend not in BACKENDS.keys(): + raise ValueError(f"Image conversion backend '{backend}' not supported") + + self.backend = backend + self.use_fallback = use_fallback + self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys())) + + def convert(self, pdf_path, png_path): + try: + converter = BACKENDS[self.backend]() + converter.convert(pdf_path, png_path) + except Exception as e: + import sys + + if self.use_fallback: + for fallback in self.fallbacks: + try: + converter = BACKENDS[fallback]() + converter.convert(pdf_path, png_path) + except Exception as e: + raise type(e)( + str(e) + f" with image conversion backend '{fallback}'" + ).with_traceback(sys.exc_info()[2]) + continue + else: + break + else: + raise type(e)( + str(e) + f" with image conversion backend '{self.backend}'" + ).with_traceback(sys.exc_info()[2]) diff --git a/src/main/python/camelot/backends/poppler_backend.py b/src/main/python/camelot/backends/poppler_backend.py new file mode 100644 index 00000000..41033729 --- /dev/null +++ b/src/main/python/camelot/backends/poppler_backend.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- + +import shutil +import subprocess + + +class PopplerBackend(object): + def convert(self, pdf_path, png_path): + pdftopng_executable = shutil.which("pdftopng") + if pdftopng_executable is None: + raise OSError( + "pdftopng is not installed. You can install it using the 'pip install pdftopng' command." + ) + + pdftopng_command = [pdftopng_executable, pdf_path, png_path] + + try: + subprocess.check_output( + " ".join(pdftopng_command), stderr=subprocess.STDOUT, shell=True + ) + except subprocess.CalledProcessError as e: + raise ValueError(e.output) diff --git a/src/main/python/camelot/cli.py b/src/main/python/camelot/cli.py new file mode 100644 index 00000000..546a32d8 --- /dev/null +++ b/src/main/python/camelot/cli.py @@ -0,0 +1,304 @@ +# -*- coding: utf-8 -*- + +import logging + +import click + +try: + import matplotlib.pyplot as plt +except ImportError: + _HAS_MPL = False +else: + _HAS_MPL = True + +from . import __version__, read_pdf, plot + + +logger = logging.getLogger("camelot") +logger.setLevel(logging.INFO) + + +class Config(object): + def __init__(self): + self.config = {} + + def set_config(self, key, value): + self.config[key] = value + + +pass_config = click.make_pass_decorator(Config) + + +@click.group(name="camelot") +@click.version_option(version=__version__) +@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.") +@click.option( + "-p", + "--pages", + default="1", + help="Comma-separated page numbers." 
" Example: 1,3,4 or 1,4-end or all.", +) +@click.option("-pw", "--password", help="Password for decryption.") +@click.option("-o", "--output", help="Output file path.") +@click.option( + "-f", + "--format", + type=click.Choice(["csv", "excel", "html", "json", "markdown", "sqlite"]), + help="Output file format.", +) +@click.option("-z", "--zip", is_flag=True, help="Create ZIP archive.") +@click.option( + "-split", + "--split_text", + is_flag=True, + help="Split text that spans across multiple cells.", +) +@click.option( + "-flag", + "--flag_size", + is_flag=True, + help="Flag text based on" " font size. Useful to detect super/subscripts.", +) +@click.option( + "-strip", + "--strip_text", + help="Characters that should be stripped from a string before" + " assigning it to a cell.", +) +@click.option( + "-M", + "--margins", + nargs=3, + default=(1.0, 0.5, 0.1), + help="PDFMiner char_margin, line_margin and word_margin.", +) +@click.pass_context +def cli(ctx, *args, **kwargs): + """Camelot: PDF Table Extraction for Humans""" + ctx.obj = Config() + for key, value in kwargs.items(): + ctx.obj.set_config(key, value) + + +@cli.command("lattice") +@click.option( + "-R", + "--table_regions", + default=[], + multiple=True, + help="Page regions to analyze. Example: x1,y1,x2,y2" + " where x1, y1 -> left-top and x2, y2 -> right-bottom.", +) +@click.option( + "-T", + "--table_areas", + default=[], + multiple=True, + help="Table areas to process. Example: x1,y1,x2,y2" + " where x1, y1 -> left-top and x2, y2 -> right-bottom.", +) +@click.option( + "-back", "--process_background", is_flag=True, help="Process background lines." +) +@click.option( + "-scale", + "--line_scale", + default=15, + help="Line size scaling factor. The larger the value," + " the smaller the detected lines.", +) +@click.option( + "-copy", + "--copy_text", + default=[], + type=click.Choice(["h", "v"]), + multiple=True, + help="Direction in which text in a spanning cell" " will be copied over.", +) +@click.option( + "-shift", + "--shift_text", + default=["l", "t"], + type=click.Choice(["", "l", "r", "t", "b"]), + multiple=True, + help="Direction in which text in a spanning cell will flow.", +) +@click.option( + "-l", + "--line_tol", + default=2, + help="Tolerance parameter used to merge close vertical" " and horizontal lines.", +) +@click.option( + "-j", + "--joint_tol", + default=2, + help="Tolerance parameter used to decide whether" + " the detected lines and points lie close to each other.", +) +@click.option( + "-block", + "--threshold_blocksize", + default=15, + help="For adaptive thresholding, size of a pixel" + " neighborhood that is used to calculate a threshold value for" + " the pixel. Example: 3, 5, 7, and so on.", +) +@click.option( + "-const", + "--threshold_constant", + default=-2, + help="For adaptive thresholding, constant subtracted" + " from the mean or weighted mean. 
Normally, it is positive but" + " may be zero or negative as well.", +) +@click.option( + "-I", + "--iterations", + default=0, + help="Number of times for erosion/dilation will be applied.", +) +@click.option( + "-res", + "--resolution", + default=300, + help="Resolution used for PDF to PNG conversion.", +) +@click.option( + "-plot", + "--plot_type", + type=click.Choice(["text", "grid", "contour", "joint", "line"]), + help="Plot elements found on PDF page for visual debugging.", +) +@click.argument("filepath", type=click.Path(exists=True)) +@pass_config +def lattice(c, *args, **kwargs): + """Use lines between text to parse the table.""" + conf = c.config + pages = conf.pop("pages") + output = conf.pop("output") + f = conf.pop("format") + compress = conf.pop("zip") + quiet = conf.pop("quiet") + plot_type = kwargs.pop("plot_type") + filepath = kwargs.pop("filepath") + kwargs.update(conf) + + table_regions = list(kwargs["table_regions"]) + kwargs["table_regions"] = None if not table_regions else table_regions + table_areas = list(kwargs["table_areas"]) + kwargs["table_areas"] = None if not table_areas else table_areas + copy_text = list(kwargs["copy_text"]) + kwargs["copy_text"] = None if not copy_text else copy_text + kwargs["shift_text"] = list(kwargs["shift_text"]) + + if plot_type is not None: + if not _HAS_MPL: + raise ImportError("matplotlib is required for plotting.") + else: + if output is None: + raise click.UsageError("Please specify output file path using --output") + if f is None: + raise click.UsageError("Please specify output file format using --format") + + tables = read_pdf( + filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs + ) + click.echo(f"Found {tables.n} tables") + if plot_type is not None: + for table in tables: + plot(table, kind=plot_type) + plt.show() + else: + tables.export(output, f=f, compress=compress) + + +@cli.command("stream") +@click.option( + "-R", + "--table_regions", + default=[], + multiple=True, + help="Page regions to analyze. Example: x1,y1,x2,y2" + " where x1, y1 -> left-top and x2, y2 -> right-bottom.", +) +@click.option( + "-T", + "--table_areas", + default=[], + multiple=True, + help="Table areas to process. 
Example: x1,y1,x2,y2" + " where x1, y1 -> left-top and x2, y2 -> right-bottom.", +) +@click.option( + "-C", + "--columns", + default=[], + multiple=True, + help="X coordinates of column separators.", +) +@click.option( + "-e", + "--edge_tol", + default=50, + help="Tolerance parameter" " for extending textedges vertically.", +) +@click.option( + "-r", + "--row_tol", + default=2, + help="Tolerance parameter" " used to combine text vertically, to generate rows.", +) +@click.option( + "-c", + "--column_tol", + default=0, + help="Tolerance parameter" + " used to combine text horizontally, to generate columns.", +) +@click.option( + "-plot", + "--plot_type", + type=click.Choice(["text", "grid", "contour", "textedge"]), + help="Plot elements found on PDF page for visual debugging.", +) +@click.argument("filepath", type=click.Path(exists=True)) +@pass_config +def stream(c, *args, **kwargs): + """Use spaces between text to parse the table.""" + conf = c.config + pages = conf.pop("pages") + output = conf.pop("output") + f = conf.pop("format") + compress = conf.pop("zip") + quiet = conf.pop("quiet") + plot_type = kwargs.pop("plot_type") + filepath = kwargs.pop("filepath") + kwargs.update(conf) + + table_regions = list(kwargs["table_regions"]) + kwargs["table_regions"] = None if not table_regions else table_regions + table_areas = list(kwargs["table_areas"]) + kwargs["table_areas"] = None if not table_areas else table_areas + columns = list(kwargs["columns"]) + kwargs["columns"] = None if not columns else columns + + if plot_type is not None: + if not _HAS_MPL: + raise ImportError("matplotlib is required for plotting.") + else: + if output is None: + raise click.UsageError("Please specify output file path using --output") + if f is None: + raise click.UsageError("Please specify output file format using --format") + + tables = read_pdf( + filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs + ) + click.echo(f"Found {tables.n} tables") + if plot_type is not None: + for table in tables: + plot(table, kind=plot_type) + plt.show() + else: + tables.export(output, f=f, compress=compress) diff --git a/src/main/python/camelot/core.py b/src/main/python/camelot/core.py new file mode 100644 index 00000000..58a98efd --- /dev/null +++ b/src/main/python/camelot/core.py @@ -0,0 +1,764 @@ +# -*- coding: utf-8 -*- + +import os +import sqlite3 +import zipfile +import tempfile +from itertools import chain +from operator import itemgetter + +import numpy as np +import pandas as pd + + +# minimum number of vertical textline intersections for a textedge +# to be considered valid +TEXTEDGE_REQUIRED_ELEMENTS = 4 +# padding added to table area on the left, right and bottom +TABLE_AREA_PADDING = 10 + + +class TextEdge(object): + """Defines a text edge coordinates relative to a left-bottom + origin. (PDF coordinate space) + + Parameters + ---------- + x : float + x-coordinate of the text edge. + y0 : float + y-coordinate of bottommost point. + y1 : float + y-coordinate of topmost point. + align : string, optional (default: 'left') + {'left', 'right', 'middle'} + + Attributes + ---------- + intersections: int + Number of intersections with horizontal text rows. + is_valid: bool + A text edge is valid if it intersections with at least + TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows. 
+ + """ + + def __init__(self, x, y0, y1, align="left"): + self.x = x + self.y0 = y0 + self.y1 = y1 + self.align = align + self.intersections = 0 + self.is_valid = False + + def __repr__(self): + x = round(self.x, 2) + y0 = round(self.y0, 2) + y1 = round(self.y1, 2) + return ( + f"" + ) + + def update_coords(self, x, y0, edge_tol=50): + """Updates the text edge's x and bottom y coordinates and sets + the is_valid attribute. + """ + if np.isclose(self.y0, y0, atol=edge_tol): + self.x = (self.intersections * self.x + x) / float(self.intersections + 1) + self.y0 = y0 + self.intersections += 1 + # a textedge is valid only if it extends uninterrupted + # over a required number of textlines + if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS: + self.is_valid = True + + +class TextEdges(object): + """Defines a dict of left, right and middle text edges found on + the PDF page. The dict has three keys based on the alignments, + and each key's value is a list of camelot.core.TextEdge objects. + """ + + def __init__(self, edge_tol=50): + self.edge_tol = edge_tol + self._textedges = {"left": [], "right": [], "middle": []} + + @staticmethod + def get_x_coord(textline, align): + """Returns the x coordinate of a text row based on the + specified alignment. + """ + x_left = textline.x0 + x_right = textline.x1 + x_middle = x_left + (x_right - x_left) / 2.0 + x_coord = {"left": x_left, "middle": x_middle, "right": x_right} + return x_coord[align] + + def find(self, x_coord, align): + """Returns the index of an existing text edge using + the specified x coordinate and alignment. + """ + for i, te in enumerate(self._textedges[align]): + if np.isclose(te.x, x_coord, atol=0.5): + return i + return None + + def add(self, textline, align): + """Adds a new text edge to the current dict.""" + x = self.get_x_coord(textline, align) + y0 = textline.y0 + y1 = textline.y1 + te = TextEdge(x, y0, y1, align=align) + self._textedges[align].append(te) + + def update(self, textline): + """Updates an existing text edge in the current dict.""" + for align in ["left", "right", "middle"]: + x_coord = self.get_x_coord(textline, align) + idx = self.find(x_coord, align) + if idx is None: + self.add(textline, align) + else: + self._textedges[align][idx].update_coords( + x_coord, textline.y0, edge_tol=self.edge_tol + ) + + def generate(self, textlines): + """Generates the text edges dict based on horizontal text + rows. + """ + for tl in textlines: + if len(tl.get_text().strip()) > 1: # TODO: hacky + self.update(tl) + + def get_relevant(self): + """Returns the list of relevant text edges (all share the same + alignment) based on which list intersects horizontal text rows + the most. + """ + intersections_sum = { + "left": sum( + te.intersections for te in self._textedges["left"] if te.is_valid + ), + "right": sum( + te.intersections for te in self._textedges["right"] if te.is_valid + ), + "middle": sum( + te.intersections for te in self._textedges["middle"] if te.is_valid + ), + } + + # TODO: naive + # get vertical textedges that intersect maximum number of + # times with horizontal textlines + relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] + return self._textedges[relevant_align] + + def get_table_areas(self, textlines, relevant_textedges): + """Returns a dict of interesting table areas on the PDF page + calculated using relevant text edges. 
+ """ + + def pad(area, average_row_height): + x0 = area[0] - TABLE_AREA_PADDING + y0 = area[1] - TABLE_AREA_PADDING + x1 = area[2] + TABLE_AREA_PADDING + # add a constant since table headers can be relatively up + y1 = area[3] + average_row_height * 5 + return (x0, y0, x1, y1) + + # sort relevant textedges in reading order + relevant_textedges.sort(key=lambda te: (-te.y0, te.x)) + + table_areas = {} + for te in relevant_textedges: + if te.is_valid: + if not table_areas: + table_areas[(te.x, te.y0, te.x, te.y1)] = None + else: + found = None + for area in table_areas: + # check for overlap + if te.y1 >= area[1] and te.y0 <= area[3]: + found = area + break + if found is None: + table_areas[(te.x, te.y0, te.x, te.y1)] = None + else: + table_areas.pop(found) + updated_area = ( + found[0], + min(te.y0, found[1]), + max(found[2], te.x), + max(found[3], te.y1), + ) + table_areas[updated_area] = None + + # extend table areas based on textlines that overlap + # vertically. it's possible that these textlines were + # eliminated during textedges generation since numbers and + # chars/words/sentences are often aligned differently. + # drawback: table areas that have paragraphs on their sides + # will include the paragraphs too. + sum_textline_height = 0 + for tl in textlines: + sum_textline_height += tl.y1 - tl.y0 + found = None + for area in table_areas: + # check for overlap + if tl.y0 >= area[1] and tl.y1 <= area[3]: + found = area + break + if found is not None: + table_areas.pop(found) + updated_area = ( + min(tl.x0, found[0]), + min(tl.y0, found[1]), + max(found[2], tl.x1), + max(found[3], tl.y1), + ) + table_areas[updated_area] = None + average_textline_height = sum_textline_height / float(len(textlines)) + + # add some padding to table areas + table_areas_padded = {} + for area in table_areas: + table_areas_padded[pad(area, average_textline_height)] = None + + return table_areas_padded + + +class Cell(object): + """Defines a cell in a table with coordinates relative to a + left-bottom origin. (PDF coordinate space) + + Parameters + ---------- + x1 : float + x-coordinate of left-bottom point. + y1 : float + y-coordinate of left-bottom point. + x2 : float + x-coordinate of right-top point. + y2 : float + y-coordinate of right-top point. + + Attributes + ---------- + lb : tuple + Tuple representing left-bottom coordinates. + lt : tuple + Tuple representing left-top coordinates. + rb : tuple + Tuple representing right-bottom coordinates. + rt : tuple + Tuple representing right-top coordinates. + left : bool + Whether or not cell is bounded on the left. + right : bool + Whether or not cell is bounded on the right. + top : bool + Whether or not cell is bounded on the top. + bottom : bool + Whether or not cell is bounded on the bottom. + hspan : bool + Whether or not cell spans horizontally. + vspan : bool + Whether or not cell spans vertically. + text : string + Text assigned to cell. 
+ + """ + + def __init__(self, x1, y1, x2, y2): + self.x1 = x1 + self.y1 = y1 + self.x2 = x2 + self.y2 = y2 + self.lb = (x1, y1) + self.lt = (x1, y2) + self.rb = (x2, y1) + self.rt = (x2, y2) + self.left = False + self.right = False + self.top = False + self.bottom = False + self.hspan = False + self.vspan = False + self._text = "" + + def __repr__(self): + x1 = round(self.x1) + y1 = round(self.y1) + x2 = round(self.x2) + y2 = round(self.y2) + return f"" + + @property + def text(self): + return self._text + + @text.setter + def text(self, t): + self._text = "".join([self._text, t]) + + @property + def bound(self): + """The number of sides on which the cell is bounded.""" + return self.top + self.bottom + self.left + self.right + + +class Table(object): + """Defines a table with coordinates relative to a left-bottom + origin. (PDF coordinate space) + + Parameters + ---------- + cols : list + List of tuples representing column x-coordinates in increasing + order. + rows : list + List of tuples representing row y-coordinates in decreasing + order. + + Attributes + ---------- + df : :class:`pandas.DataFrame` + shape : tuple + Shape of the table. + accuracy : float + Accuracy with which text was assigned to the cell. + whitespace : float + Percentage of whitespace in the table. + order : int + Table number on PDF page. + page : int + PDF page number. + + """ + + def __init__(self, cols, rows): + self.cols = cols + self.rows = rows + self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows] + self.df = None + self.shape = (0, 0) + self.accuracy = 0 + self.whitespace = 0 + self.order = None + self.page = None + + def __repr__(self): + return f"<{self.__class__.__name__} shape={self.shape}>" + + def __lt__(self, other): + if self.page == other.page: + if self.order < other.order: + return True + if self.page < other.page: + return True + + @property + def data(self): + """Returns two-dimensional list of strings in table.""" + d = [] + for row in self.cells: + d.append([cell.text.strip() for cell in row]) + return d + + @property + def parsing_report(self): + """Returns a parsing report with %accuracy, %whitespace, + table number on page and page number. + """ + # pretty? + report = { + "accuracy": round(self.accuracy, 2), + "whitespace": round(self.whitespace, 2), + "order": self.order, + "page": self.page, + } + return report + + def set_all_edges(self): + """Sets all table edges to True.""" + for row in self.cells: + for cell in row: + cell.left = cell.right = cell.top = cell.bottom = True + return self + + def set_edges(self, vertical, horizontal, joint_tol=2): + """Sets a cell's edges to True depending on whether the cell's + coordinates overlap with the line's coordinates within a + tolerance. + + Parameters + ---------- + vertical : list + List of detected vertical lines. + horizontal : list + List of detected horizontal lines. 
+ + """ + for v in vertical: + # find closest x coord + # iterate over y coords and find closest start and end points + i = [ + i + for i, t in enumerate(self.cols) + if np.isclose(v[0], t[0], atol=joint_tol) + ] + j = [ + j + for j, t in enumerate(self.rows) + if np.isclose(v[3], t[0], atol=joint_tol) + ] + k = [ + k + for k, t in enumerate(self.rows) + if np.isclose(v[1], t[0], atol=joint_tol) + ] + if not j: + continue + J = j[0] + if i == [0]: # only left edge + L = i[0] + if k: + K = k[0] + while J < K: + self.cells[J][L].left = True + J += 1 + else: + K = len(self.rows) + while J < K: + self.cells[J][L].left = True + J += 1 + elif i == []: # only right edge + L = len(self.cols) - 1 + if k: + K = k[0] + while J < K: + self.cells[J][L].right = True + J += 1 + else: + K = len(self.rows) + while J < K: + self.cells[J][L].right = True + J += 1 + else: # both left and right edges + L = i[0] + if k: + K = k[0] + while J < K: + self.cells[J][L].left = True + self.cells[J][L - 1].right = True + J += 1 + else: + K = len(self.rows) + while J < K: + self.cells[J][L].left = True + self.cells[J][L - 1].right = True + J += 1 + + for h in horizontal: + # find closest y coord + # iterate over x coords and find closest start and end points + i = [ + i + for i, t in enumerate(self.rows) + if np.isclose(h[1], t[0], atol=joint_tol) + ] + j = [ + j + for j, t in enumerate(self.cols) + if np.isclose(h[0], t[0], atol=joint_tol) + ] + k = [ + k + for k, t in enumerate(self.cols) + if np.isclose(h[2], t[0], atol=joint_tol) + ] + if not j: + continue + J = j[0] + if i == [0]: # only top edge + L = i[0] + if k: + K = k[0] + while J < K: + self.cells[L][J].top = True + J += 1 + else: + K = len(self.cols) + while J < K: + self.cells[L][J].top = True + J += 1 + elif i == []: # only bottom edge + L = len(self.rows) - 1 + if k: + K = k[0] + while J < K: + self.cells[L][J].bottom = True + J += 1 + else: + K = len(self.cols) + while J < K: + self.cells[L][J].bottom = True + J += 1 + else: # both top and bottom edges + L = i[0] + if k: + K = k[0] + while J < K: + self.cells[L][J].top = True + self.cells[L - 1][J].bottom = True + J += 1 + else: + K = len(self.cols) + while J < K: + self.cells[L][J].top = True + self.cells[L - 1][J].bottom = True + J += 1 + + return self + + def set_border(self): + """Sets table border edges to True.""" + for r in range(len(self.rows)): + self.cells[r][0].left = True + self.cells[r][len(self.cols) - 1].right = True + for c in range(len(self.cols)): + self.cells[0][c].top = True + self.cells[len(self.rows) - 1][c].bottom = True + return self + + def set_span(self): + """Sets a cell's hspan or vspan attribute to True depending + on whether the cell spans horizontally or vertically. + """ + for row in self.cells: + for cell in row: + left = cell.left + right = cell.right + top = cell.top + bottom = cell.bottom + if cell.bound == 4: + continue + elif cell.bound == 3: + if not left and (right and top and bottom): + cell.hspan = True + elif not right and (left and top and bottom): + cell.hspan = True + elif not top and (left and right and bottom): + cell.vspan = True + elif not bottom and (left and right and top): + cell.vspan = True + elif cell.bound == 2: + if left and right and (not top and not bottom): + cell.vspan = True + elif top and bottom and (not left and not right): + cell.hspan = True + elif cell.bound in [0, 1]: + cell.vspan = True + cell.hspan = True + return self + + def to_csv(self, path, **kwargs): + """Writes Table to a comma-separated values (csv) file. 
+ + For kwargs, check :meth:`pandas.DataFrame.to_csv`. + + Parameters + ---------- + path : str + Output filepath. + + """ + kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1} + kw.update(kwargs) + self.df.to_csv(path, **kw) + + def to_json(self, path, **kwargs): + """Writes Table to a JSON file. + + For kwargs, check :meth:`pandas.DataFrame.to_json`. + + Parameters + ---------- + path : str + Output filepath. + + """ + kw = {"orient": "records"} + kw.update(kwargs) + json_string = self.df.to_json(**kw) + with open(path, "w") as f: + f.write(json_string) + + def to_excel(self, path, **kwargs): + """Writes Table to an Excel file. + + For kwargs, check :meth:`pandas.DataFrame.to_excel`. + + Parameters + ---------- + path : str + Output filepath. + + """ + kw = { + "sheet_name": f"page-{self.page}-table-{self.order}", + "encoding": "utf-8", + } + kw.update(kwargs) + writer = pd.ExcelWriter(path) + self.df.to_excel(writer, **kw) + writer.save() + + def to_html(self, path, **kwargs): + """Writes Table to an HTML file. + + For kwargs, check :meth:`pandas.DataFrame.to_html`. + + Parameters + ---------- + path : str + Output filepath. + + """ + html_string = self.df.to_html(**kwargs) + with open(path, "w", encoding="utf-8") as f: + f.write(html_string) + + def to_markdown(self, path, **kwargs): + """Writes Table to a Markdown file. + + For kwargs, check :meth:`pandas.DataFrame.to_markdown`. + + Parameters + ---------- + path : str + Output filepath. + + """ + md_string = self.df.to_markdown(**kwargs) + with open(path, "w", encoding="utf-8") as f: + f.write(md_string) + + def to_sqlite(self, path, **kwargs): + """Writes Table to sqlite database. + + For kwargs, check :meth:`pandas.DataFrame.to_sql`. + + Parameters + ---------- + path : str + Output filepath. + + """ + kw = {"if_exists": "replace", "index": False} + kw.update(kwargs) + conn = sqlite3.connect(path) + table_name = f"page-{self.page}-table-{self.order}" + self.df.to_sql(table_name, conn, **kw) + conn.commit() + conn.close() + + +class TableList(object): + """Defines a list of camelot.core.Table objects. Each table can + be accessed using its index. + + Attributes + ---------- + n : int + Number of tables in the list. + + """ + + def __init__(self, tables): + self._tables = tables + + def __repr__(self): + return f"<{self.__class__.__name__} n={self.n}>" + + def __len__(self): + return len(self._tables) + + def __getitem__(self, idx): + return self._tables[idx] + + @staticmethod + def _format_func(table, f): + return getattr(table, f"to_{f}") + + @property + def n(self): + return len(self) + + def _write_file(self, f=None, **kwargs): + dirname = kwargs.get("dirname") + root = kwargs.get("root") + ext = kwargs.get("ext") + for table in self._tables: + filename = f"{root}-page-{table.page}-table-{table.order}{ext}" + filepath = os.path.join(dirname, filename) + to_format = self._format_func(table, f) + to_format(filepath) + + def _compress_dir(self, **kwargs): + path = kwargs.get("path") + dirname = kwargs.get("dirname") + root = kwargs.get("root") + ext = kwargs.get("ext") + zipname = os.path.join(os.path.dirname(path), root) + ".zip" + with zipfile.ZipFile(zipname, "w", allowZip64=True) as z: + for table in self._tables: + filename = f"{root}-page-{table.page}-table-{table.order}{ext}" + filepath = os.path.join(dirname, filename) + z.write(filepath, os.path.basename(filepath)) + + def export(self, path, f="csv", compress=False): + """Exports the list of tables to specified file format. 
+ + Parameters + ---------- + path : str + Output filepath. + f : str + File format. Can be csv, excel, html, json, markdown or sqlite. + compress : bool + Whether or not to add files to a ZIP archive. + + """ + dirname = os.path.dirname(path) + basename = os.path.basename(path) + root, ext = os.path.splitext(basename) + if compress: + dirname = tempfile.mkdtemp() + + kwargs = {"path": path, "dirname": dirname, "root": root, "ext": ext} + + if f in ["csv", "html", "json", "markdown"]: + self._write_file(f=f, **kwargs) + if compress: + self._compress_dir(**kwargs) + elif f == "excel": + filepath = os.path.join(dirname, basename) + writer = pd.ExcelWriter(filepath) + for table in self._tables: + sheet_name = f"page-{table.page}-table-{table.order}" + table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8") + writer.save() + if compress: + zipname = os.path.join(os.path.dirname(path), root) + ".zip" + with zipfile.ZipFile(zipname, "w", allowZip64=True) as z: + z.write(filepath, os.path.basename(filepath)) + elif f == "sqlite": + filepath = os.path.join(dirname, basename) + for table in self._tables: + table.to_sqlite(filepath) + if compress: + zipname = os.path.join(os.path.dirname(path), root) + ".zip" + with zipfile.ZipFile(zipname, "w", allowZip64=True) as z: + z.write(filepath, os.path.basename(filepath)) diff --git a/src/main/python/camelot/handlers.py b/src/main/python/camelot/handlers.py new file mode 100644 index 00000000..3feadb60 --- /dev/null +++ b/src/main/python/camelot/handlers.py @@ -0,0 +1,180 @@ +# -*- coding: utf-8 -*- + +import os +import sys + +from pypdf import PdfReader, PdfWriter + +from .core import TableList +from .parsers import Stream, Lattice +from .utils import ( + TemporaryDirectory, + get_page_layout, + get_text_objects, + get_rotation, + is_url, + download_url, +) + + +class PDFHandler(object): + """Handles all operations like temp directory creation, splitting + file into single page PDFs, parsing each PDF and then removing the + temp directory. + + Parameters + ---------- + filepath : str + Filepath or URL of the PDF file. + pages : str, optional (default: '1') + Comma-separated page numbers. + Example: '1,3,4' or '1,4-end' or 'all'. + password : str, optional (default: None) + Password for decryption. + + """ + + def __init__(self, filepath, pages="1", password=None): + if is_url(filepath): + filepath = download_url(filepath) + self.filepath = filepath + #if not filepath.lower().endswith(".pdf"): + # raise NotImplementedError("File format not supported") + + if password is None: + self.password = "" + else: + self.password = password + if sys.version_info[0] < 3: + self.password = self.password.encode("ascii") + self.pages = self._get_pages(pages) + + def _get_pages(self, pages): + """Converts pages string to list of ints. + + Parameters + ---------- + filepath : str + Filepath or URL of the PDF file. + pages : str, optional (default: '1') + Comma-separated page numbers. + Example: '1,3,4' or '1,4-end' or 'all'. + + Returns + ------- + P : list + List of int page numbers. 
+ + """ + page_numbers = [] + + if pages == "1": + page_numbers.append({"start": 1, "end": 1}) + else: + with open(self.filepath, "rb") as f: + infile = PdfReader(f, strict=False) + + if infile.is_encrypted: + infile.decrypt(self.password) + + if pages == "all": + page_numbers.append({"start": 1, "end": len(infile.pages)}) + else: + for r in pages.split(","): + if "-" in r: + a, b = r.split("-") + if b == "end": + b = len(infile.pages) + page_numbers.append({"start": int(a), "end": int(b)}) + else: + page_numbers.append({"start": int(r), "end": int(r)}) + + P = [] + for p in page_numbers: + P.extend(range(p["start"], p["end"] + 1)) + return sorted(set(P)) + + def _save_page(self, filepath, page, temp): + """Saves specified page from PDF into a temporary directory. + + Parameters + ---------- + filepath : str + Filepath or URL of the PDF file. + page : int + Page number. + temp : str + Tmp directory. + + """ + with open(filepath, "rb") as fileobj: + infile = PdfReader(fileobj, strict=False) + if infile.is_encrypted: + infile.decrypt(self.password) + fpath = os.path.join(temp, f"page-{page}.pdf") + froot, fext = os.path.splitext(fpath) + p = infile.pages[page - 1] + outfile = PdfWriter() + outfile.add_page(p) + with open(fpath, "wb") as f: + outfile.write(f) + layout, dim = get_page_layout(fpath) + # fix rotated PDF + chars = get_text_objects(layout, ltype="char") + horizontal_text = get_text_objects(layout, ltype="horizontal_text") + vertical_text = get_text_objects(layout, ltype="vertical_text") + rotation = get_rotation(chars, horizontal_text, vertical_text) + if rotation != "": + fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) + os.rename(fpath, fpath_new) + instream = open(fpath_new, "rb") + infile = PdfReader(instream, strict=False) + if infile.is_encrypted: + infile.decrypt(self.password) + outfile = PdfWriter() + p = infile.pages[0] + if rotation == "anticlockwise": + p.rotate(90) + elif rotation == "clockwise": + p.rotate(-90) + outfile.add_page(p) + with open(fpath, "wb") as f: + outfile.write(f) + instream.close() + + def parse( + self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs + ): + """Extracts tables by calling parser.get_tables on all single + page PDFs. + + Parameters + ---------- + flavor : str (default: 'lattice') + The parsing method to use ('lattice' or 'stream'). + Lattice is used by default. + suppress_stdout : str (default: False) + Suppress logs and warnings. + layout_kwargs : dict, optional (default: {}) + A dict of `pdfminer.layout.LAParams `_ kwargs. + kwargs : dict + See camelot.read_pdf kwargs. + + Returns + ------- + tables : camelot.core.TableList + List of tables found in PDF. 
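# Illustrative sketch of the pages-string grammar parsed by _get_pages above
# (assumes 'sample.pdf' is a local, unencrypted 10-page file):
from camelot.handlers import PDFHandler

print(PDFHandler("sample.pdf", pages="1,3-5").pages)   # -> [1, 3, 4, 5]
print(PDFHandler("sample.pdf", pages="7-end").pages)   # -> [7, 8, 9, 10]
print(PDFHandler("sample.pdf", pages="all").pages)     # -> [1, 2, ..., 10]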
+ + """ + tables = [] + with TemporaryDirectory() as tempdir: + for p in self.pages: + self._save_page(self.filepath, p, tempdir) + pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages] + parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs) + for p in pages: + t = parser.extract_tables( + p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs + ) + tables.extend(t) + return TableList(sorted(tables)) diff --git a/src/main/python/camelot/image_processing.py b/src/main/python/camelot/image_processing.py new file mode 100644 index 00000000..08aae1b5 --- /dev/null +++ b/src/main/python/camelot/image_processing.py @@ -0,0 +1,399 @@ +# -*- coding: utf-8 -*- + +import cv2 +import numpy as np + +def adaptive_threshold_with_img(img, process_background=False, blocksize=15, c=-2): + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + if process_background: + threshold = cv2.adaptiveThreshold( + gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c + ) + else: + threshold = cv2.adaptiveThreshold( + np.invert(gray), + 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, + blocksize, + c, + ) + return img, threshold + +def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): + """Thresholds an image using OpenCV's adaptiveThreshold. + + Parameters + ---------- + imagename : string + Path to image file. + process_background : bool, optional (default: False) + Whether or not to process lines that are in background. + blocksize : int, optional (default: 15) + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + For more information, refer `OpenCV's adaptiveThreshold `_. + c : int, optional (default: -2) + Constant subtracted from the mean or weighted mean. + Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold `_. + + Returns + ------- + img : object + numpy.ndarray representing the original image. + threshold : object + numpy.ndarray representing the thresholded image. + + """ + img = cv2.imread(imagename) + img, threshold = adaptive_threshold_with_img(img, process_background, blocksize, c) + return img, threshold + + +def find_lines( + threshold, regions=None, direction="horizontal", line_scale=15, iterations=0 +): + """Finds horizontal and vertical lines by applying morphological + transformations on an image. + + Parameters + ---------- + threshold : object + numpy.ndarray representing the thresholded image. + regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in image coordinate space. + direction : string, optional (default: 'horizontal') + Specifies whether to find vertical or horizontal lines. + line_scale : int, optional (default: 15) + Factor by which the page dimensions will be divided to get + smallest length of lines that should be detected. + + The larger this value, smaller the detected lines. Making it + too large will lead to text being detected as lines. + iterations : int, optional (default: 0) + Number of times for erosion/dilation is applied. + + For more information, refer `OpenCV's dilate `_. + + Returns + ------- + dmask : object + numpy.ndarray representing pixels where vertical/horizontal + lines lie. 
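# Quick sketch of the thresholding helper above; 'page-1.png' stands in for an
# image produced by the PDF-to-PNG backend (module path assumed from this patch).
import cv2
from camelot.image_processing import adaptive_threshold

img, threshold = adaptive_threshold("page-1.png", process_background=False,
                                    blocksize=15, c=-2)
cv2.imwrite("page-1-threshold.png", threshold)   # inspect the binarized page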
+ lines : list + List of tuples representing vertical/horizontal lines with + coordinates relative to a left-top origin in + image coordinate space. + + """ + lines = [] + + if direction == "vertical": + size = threshold.shape[0] // line_scale + el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) + elif direction == "horizontal": + size = threshold.shape[1] // line_scale + el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) + elif direction is None: + raise ValueError("Specify direction as either 'vertical' or 'horizontal'") + + if regions is not None: + region_mask = np.zeros(threshold.shape) + for region in regions: + x, y, w, h = region + region_mask[y : y + h, x : x + w] = 1 + threshold = np.multiply(threshold, region_mask) + + threshold = cv2.erode(threshold, el) + threshold = cv2.dilate(threshold, el) + dmask = cv2.dilate(threshold, el, iterations=iterations) + + try: + _, contours, _ = cv2.findContours( + threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) + except ValueError: + # for opencv backward compatibility + contours, _ = cv2.findContours( + threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) + + for c in contours: + x, y, w, h = cv2.boundingRect(c) + x1, x2 = x, x + w + y1, y2 = y, y + h + if direction == "vertical": + lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1)) + elif direction == "horizontal": + lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2)) + + return dmask, lines + + +def find_contours(vertical, horizontal): + """Finds table boundaries using OpenCV's findContours. + + Parameters + ---------- + vertical : object + numpy.ndarray representing pixels where vertical lines lie. + horizontal : object + numpy.ndarray representing pixels where horizontal lines lie. + + Returns + ------- + cont : list + List of tuples representing table boundaries. Each tuple is of + the form (x, y, w, h) where (x, y) -> left-top, w -> width and + h -> height in image coordinate space. + + """ + mask = vertical + horizontal + + try: + __, contours, __ = cv2.findContours( + mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) + except ValueError: + # for opencv backward compatibility + contours, __ = cv2.findContours( + mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) + # sort in reverse based on contour area and use first 10 contours + contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] + + cont = [] + for c in contours: + c_poly = cv2.approxPolyDP(c, 3, True) + x, y, w, h = cv2.boundingRect(c_poly) + cont.append((x, y, w, h)) + return cont + + +def find_joints(contours, vertical, horizontal): + """Finds joints/intersections present inside each table boundary. + + Parameters + ---------- + contours : list + List of tuples representing table boundaries. Each tuple is of + the form (x, y, w, h) where (x, y) -> left-top, w -> width and + h -> height in image coordinate space. + vertical : object + numpy.ndarray representing pixels where vertical lines lie. + horizontal : object + numpy.ndarray representing pixels where horizontal lines lie. + + Returns + ------- + tables : dict + Dict with table boundaries as keys and list of intersections + in that boundary as their value. + Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb + and (x2, y2) -> rt in image coordinate space. 
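# Sketch of how the detection helpers above fit together, mirroring the calls
# made later by the Lattice parser (the file name is a placeholder):
from camelot.image_processing import (adaptive_threshold, find_lines,
                                      find_contours, find_joints)

img, threshold = adaptive_threshold("page-1.png")
v_mask, v_segments = find_lines(threshold, direction="vertical", line_scale=15)
h_mask, h_segments = find_lines(threshold, direction="horizontal", line_scale=15)
boundaries = find_contours(v_mask, h_mask)          # up to 10 (x, y, w, h) candidates
tables = find_joints(boundaries, v_mask, h_mask)    # bbox -> list of joint coordinates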
+ + """ + joints = np.multiply(vertical, horizontal) + tables = {} + for c in contours: + x, y, w, h = c + roi = joints[y : y + h, x : x + w] + try: + __, jc, __ = cv2.findContours( + roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE + ) + except ValueError: + # for opencv backward compatibility + jc, __ = cv2.findContours( + roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE + ) + if len(jc) <= 4: # remove contours with less than 4 joints + continue + joint_coords = [] + for j in jc: + jx, jy, jw, jh = cv2.boundingRect(j) + c1, c2 = x + (2 * jx + jw) // 2, y + (2 * jy + jh) // 2 + joint_coords.append((c1, c2)) + tables[(x, y + h, x + w, y)] = joint_coords + + return tables + + +def intersectes(r1, r2): + """ Checking the intersection of two ribs. + + :param r1: tuple + (x11, y11, x21, y21) where (x11, y11) -> start coordinates of r1 + and (x21, y21) -> end coordinates of rib1. + :param r2: tuple + (x12, y12, x22, y22) where (x12, y12) -> start coordinates of r2 + and (x22, y22) -> end coordinates of rib2. + :return: boolean + if ribs intersect True else False. + """ + c_m = 10 + x11, y11, x21, y21 = r1[0], r1[1], r1[2], r1[3] + x12, y12, x22, y22 = r2[0], r2[1], r2[2], r2[3] + + if (x11 == x21 and x12 == x22) or (y11 == y21 and y12 == y22): + return False + elif x11 == x21 and y12 == y22: + return x11 + c_m >= x12 and x11 <= x22 + c_m \ + and y11 + c_m >= y12 >= y21 - c_m + else: + return x12 + c_m >= x11 and x12 <= x21 + c_m \ + and y12 + c_m >= y11 >= y22 - c_m + + +def draw_v(image, h_lines): + """ + Draws the vertical lines between given horisontal lines, corrects the image. + + :param image: img : object + numpy.ndarray representing the image. + :param h_lines: list + List of tuples representing horizontal lines with coordinates. + :return: img : object + numpy.ndarray representing the new image. + """ + + if len(h_lines) > 0: + + h_lines = sorted(h_lines, key=lambda x: (x[0], x[1])) + + l_x, r_x = h_lines[0][0], h_lines[0][2] + u_y, d_y = h_lines[0][1], h_lines[0][1] + + for i in range(len(h_lines)): + + if l_x == h_lines[i][0] and i != len(h_lines) - 1: + r_x = max(r_x, h_lines[i][2]) + + elif l_x == h_lines[i][0]: + d_y = h_lines[i][3] + cv2.rectangle(image, pt1=(l_x, u_y), pt2=(r_x, d_y), color=(0, 0, 0), thickness=3) + + else: + d_y = h_lines[i - 1][3] + cv2.rectangle(image, pt1=(l_x, u_y), pt2=(r_x, d_y), color=(0, 0, 0), thickness=3) + l_x, r_x = h_lines[i][0], h_lines[i][2] + u_y, d_y = h_lines[i][1], h_lines[i][3] + + + return image + + +def draw_h(image, v_lines): + ''' + Draws the horisontal lines between given vertical lines, corrects the image. + + :param image: img : object + numpy.ndarray representing the image. + :param v_lines: list + List of tuples representing vertical lines with + coordinates. + :return: image : object + numpy.ndarray representing the new image. 
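# Tiny self-contained check of the segment-intersection helper above; the
# coordinates are made up and follow the (x1, y1, x2, y2) tuples produced by
# find_lines (vertical tuples carry the larger y first).
from camelot.image_processing import intersectes

horizontal = (100, 200, 400, 200)   # y1 == y2
vertical = (250, 350, 250, 150)     # x1 == x2
print(intersectes(horizontal, vertical))   # True: they cross near (250, 200),
                                           # within the 10 px margin c_m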
+ ''' + if (len(v_lines) > 0): + v_lines = sorted(v_lines, key=lambda x: (x[3], x[0])) + + u_y, d_y = v_lines[0][3], v_lines[0][1] + + for i in range(len(v_lines)): + + if u_y == v_lines[i][3] and i != len(v_lines) - 1: + d_y = max(d_y, v_lines[i][1]) + + elif u_y == v_lines[i][3]: + d_y = max(d_y, v_lines[i][1]) + cv2.rectangle(image, pt1=(50, u_y), pt2=(image.shape[1] - 50, d_y), color=(0, 0, 0), thickness=3) + + else: + cv2.rectangle(image, pt1=(50, u_y), pt2=(image.shape[1] - 50, d_y), color=(0, 0, 0), thickness=3) + u_y, d_y = v_lines[i][3], v_lines[i][1] + + return image + +def correct_lines(image, v_segments, h_segments): + ''' + + :param image: object + numpy.ndarray representing the image. + :param v_segments: list + List of tuples representing vertical lines with + coordinates. + :param h_segments: list + List of tuples representing horizontal lines with + coordinates. + :return: image : object + numpy.ndarray representing the new image. + ''' + + h_size, v_size = len(h_segments), len(v_segments) + + if h_size > 1 and v_size == 0: + image = draw_v(image, h_segments) + + elif h_size == 0 and v_size > 1: + image = draw_h(image, v_segments) + + elif v_size >= 1 and h_size >= 1: + + ribs = v_segments[:] + h_segments[:] + segments = [[ribs[i]][:] for i in range(len(ribs))] + + for i in range(0, len(ribs) - 1): + for j in range(i+1, len(ribs)): + if intersectes(ribs[i],ribs[j]): + for sg1 in segments: + cur_sg = [] + if ribs[i] in sg1: + cur_sg = sg1 + break + + for sg2 in segments: + del_sg = [] + if ribs[j] in sg2 and cur_sg != sg2: + cur_sg += sg2[:] + del_sg = sg2 + break + if del_sg in segments: + segments.remove(del_sg) + + + s_lines = [] + + for i in range(len(segments)): + + min_x, min_y = segments[i][0][0], segments[i][0][3] + max_x, max_y = segments[i][0][2], segments[i][0][1] + + if len(segments[i]) > 1: + for line in segments[i]: + min_x, min_y = min(min_x, line[0]),min(min_y, line[3]) + max_x, max_y = max(max_x, line[2]), max(max_y,line[1]) + cv2.rectangle(image, pt1=(min_x, min_y), pt2=(max_x, max_y), color=(0, 0, 0), thickness=3) + else: + s_lines += segments[i] + + h_s_lines, v_s_lines = [], [] + + for line in s_lines: + v_s_lines.append(line) if line[0] == line[2] else h_s_lines.append(line) + + image = draw_h(image, v_s_lines) + image = draw_v(image, h_s_lines) + + '''cv2.imshow("Image", image) + cv2.waitKey(0) + cv2.destroyAllWindows() + ''' + return image + diff --git a/src/main/python/camelot/io.py b/src/main/python/camelot/io.py new file mode 100644 index 00000000..a27a7c66 --- /dev/null +++ b/src/main/python/camelot/io.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +import warnings + +from .handlers import PDFHandler +from .utils import validate_input, remove_extra + + +def read_pdf( + filepath, + pages="1", + password=None, + flavor="lattice", + suppress_stdout=False, + layout_kwargs={}, + **kwargs +): + """Read PDF and return extracted tables. + + Note: kwargs annotated with ^ can only be used with flavor='stream' + and kwargs annotated with * can only be used with flavor='lattice'. + + Parameters + ---------- + filepath : str + Filepath or URL of the PDF file. + pages : str, optional (default: '1') + Comma-separated page numbers. + Example: '1,3,4' or '1,4-end' or 'all'. + password : str, optional (default: None) + Password for decryption. + flavor : str (default: 'lattice') + The parsing method to use ('lattice' or 'stream'). + Lattice is used by default. + suppress_stdout : bool, optional (default: True) + Print all logs and warnings. 
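# Hedged sketch of the line-completion step implemented by correct_lines above:
# partially ruled tables get their missing sides drawn in so that the later
# contour/joint detection sees closed cells (file names are placeholders).
import cv2
from camelot.image_processing import adaptive_threshold, find_lines, correct_lines

img, threshold = adaptive_threshold("page-1.png")
_, v_segments = find_lines(threshold, direction="vertical", line_scale=15)
_, h_segments = find_lines(threshold, direction="horizontal", line_scale=15)
img = correct_lines(img, v_segments, h_segments)
cv2.imwrite("page-1-corrected.png", img)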
+ layout_kwargs : dict, optional (default: {}) + A dict of `pdfminer.layout.LAParams `_ kwargs. + table_areas : list, optional (default: None) + List of table area strings of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. + columns^ : list, optional (default: None) + List of column x-coordinates strings where the coordinates + are comma-separated. + split_text : bool, optional (default: False) + Split text that spans across multiple cells. + flag_size : bool, optional (default: False) + Flag text based on font size. Useful to detect + super/subscripts. Adds around flagged text. + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + row_tol^ : int, optional (default: 2) + Tolerance parameter used to combine text vertically, + to generate rows. + column_tol^ : int, optional (default: 0) + Tolerance parameter used to combine text horizontally, + to generate columns. + process_background* : bool, optional (default: False) + Process background lines. + line_scale* : int, optional (default: 15) + Line size scaling factor. The larger the value the smaller + the detected lines. Making it very large will lead to text + being detected as lines. + copy_text* : list, optional (default: None) + {'h', 'v'} + Direction in which text in a spanning cell will be copied + over. + shift_text* : list, optional (default: ['l', 't']) + {'l', 'r', 't', 'b'} + Direction in which text in a spanning cell will flow. + line_tol* : int, optional (default: 2) + Tolerance parameter used to merge close vertical and horizontal + lines. + joint_tol* : int, optional (default: 2) + Tolerance parameter used to decide whether the detected lines + and points lie close to each other. + threshold_blocksize* : int, optional (default: 15) + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + For more information, refer `OpenCV's adaptiveThreshold `_. + threshold_constant* : int, optional (default: -2) + Constant subtracted from the mean or weighted mean. + Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold `_. + iterations* : int, optional (default: 0) + Number of times for erosion/dilation is applied. + + For more information, refer `OpenCV's dilate `_. + resolution* : int, optional (default: 300) + Resolution used for PDF to PNG conversion. + + Returns + ------- + tables : camelot.core.TableList + + """ + if flavor not in ["lattice", "stream"]: + raise NotImplementedError( + "Unknown flavor specified." 
" Use either 'lattice' or 'stream'" + ) + + with warnings.catch_warnings(): + if suppress_stdout: + warnings.simplefilter("ignore") + + validate_input(kwargs, flavor=flavor) + p = PDFHandler(filepath, pages=pages, password=password) + kwargs = remove_extra(kwargs, flavor=flavor) + tables = p.parse( + flavor=flavor, + suppress_stdout=suppress_stdout, + layout_kwargs=layout_kwargs, + **kwargs + ) + return tables diff --git a/src/main/python/camelot/parsers/__init__.py b/src/main/python/camelot/parsers/__init__.py new file mode 100644 index 00000000..5cc66051 --- /dev/null +++ b/src/main/python/camelot/parsers/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- + +from .stream import Stream +from .lattice import Lattice diff --git a/src/main/python/camelot/parsers/base.py b/src/main/python/camelot/parsers/base.py new file mode 100644 index 00000000..aeba056f --- /dev/null +++ b/src/main/python/camelot/parsers/base.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- + +import os + +from ..utils import get_page_layout, get_text_objects + + +class BaseParser(object): + """Defines a base parser.""" + + def _generate_layout(self, filename, layout_kwargs): + self.filename = filename + self.layout_kwargs = layout_kwargs + self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs) + self.images = get_text_objects(self.layout, ltype="image") + self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text") + self.vertical_text = get_text_objects(self.layout, ltype="vertical_text") + self.pdf_width, self.pdf_height = self.dimensions + self.rootname, __ = os.path.splitext(self.filename) + self.imagename = "".join([self.rootname, ".png"]) diff --git a/src/main/python/camelot/parsers/lattice.py b/src/main/python/camelot/parsers/lattice.py new file mode 100644 index 00000000..5d8a79c8 --- /dev/null +++ b/src/main/python/camelot/parsers/lattice.py @@ -0,0 +1,464 @@ +# -*- coding: utf-8 -*- + +import os +import sys +import copy +import locale +import logging +import warnings + +import numpy as np +import pandas as pd + +from .base import BaseParser +from ..core import Table +from ..utils import ( + scale_image, + scale_pdf, + segments_in_bbox, + text_in_bbox, + merge_close_lines, + get_table_index, + compute_accuracy, + compute_whitespace, +) +from ..image_processing import ( + adaptive_threshold, + find_lines, + find_contours, + find_joints, + correct_lines, + adaptive_threshold_with_img, +) +from ..backends.image_conversion import BACKENDS + + +logger = logging.getLogger("camelot") + + +class Lattice(BaseParser): + """Lattice method of parsing looks for lines between text + to parse the table. + + Parameters + ---------- + table_regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. + table_areas : list, optional (default: None) + List of table area strings of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. + process_background : bool, optional (default: False) + Process background lines. + line_scale : int, optional (default: 15) + Line size scaling factor. The larger the value the smaller + the detected lines. Making it very large will lead to text + being detected as lines. + copy_text : list, optional (default: None) + {'h', 'v'} + Direction in which text in a spanning cell will be copied + over. 
+ shift_text : list, optional (default: ['l', 't']) + {'l', 'r', 't', 'b'} + Direction in which text in a spanning cell will flow. + split_text : bool, optional (default: False) + Split text that spans across multiple cells. + flag_size : bool, optional (default: False) + Flag text based on font size. Useful to detect + super/subscripts. Adds around flagged text. + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + line_tol : int, optional (default: 2) + Tolerance parameter used to merge close vertical and horizontal + lines. + joint_tol : int, optional (default: 2) + Tolerance parameter used to decide whether the detected lines + and points lie close to each other. + threshold_blocksize : int, optional (default: 15) + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + For more information, refer `OpenCV's adaptiveThreshold `_. + threshold_constant : int, optional (default: -2) + Constant subtracted from the mean or weighted mean. + Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold `_. + iterations : int, optional (default: 0) + Number of times for erosion/dilation is applied. + + For more information, refer `OpenCV's dilate `_. + resolution : int, optional (default: 300) + Resolution used for PDF to PNG conversion. + + """ + + def __init__( + self, + table_regions=None, + table_areas=None, + process_background=False, + line_scale=15, + copy_text=None, + shift_text=["l", "t"], + split_text=False, + flag_size=False, + strip_text="", + line_tol=2, + joint_tol=2, + threshold_blocksize=15, + threshold_constant=-2, + iterations=0, + resolution=300, + backend="ghostscript", + **kwargs, + ): + self.table_regions = table_regions + self.table_areas = table_areas + self.process_background = process_background + self.line_scale = line_scale + self.copy_text = copy_text + self.shift_text = shift_text + self.split_text = split_text + self.flag_size = flag_size + self.strip_text = strip_text + self.line_tol = line_tol + self.joint_tol = joint_tol + self.threshold_blocksize = threshold_blocksize + self.threshold_constant = threshold_constant + self.iterations = iterations + self.resolution = resolution + self.backend = Lattice._get_backend(backend) + + @staticmethod + def _get_backend(backend): + def implements_convert(): + methods = [ + method for method in dir(backend) if method.startswith("__") is False + ] + return "convert" in methods + + if isinstance(backend, str): + if backend not in BACKENDS.keys(): + raise NotImplementedError( + f"Unknown backend '{backend}' specified. Please use either 'poppler' or 'ghostscript'." + ) + + if backend == "ghostscript": + warnings.warn( + "'ghostscript' will be replaced by 'poppler' as the default image conversion" + " backend in v0.12.0. You can try out 'poppler' with backend='poppler'.", + DeprecationWarning, + ) + + return BACKENDS[backend]() + else: + if not implements_convert(): + raise NotImplementedError( + f"'{backend}' must implement a 'convert' method" + ) + + return backend + + @staticmethod + def _reduce_index(t, idx, shift_text): + """Reduces index of a text object if it lies within a spanning + cell. + + Parameters + ---------- + table : camelot.core.Table + idx : list + List of tuples of the form (r_idx, c_idx, text). 
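# Sketch of the backend hook handled by _get_backend above. It is assumed here
# that 'poppler' is registered in BACKENDS; alternatively, any object exposing a
# convert(pdf_path, png_path) method is accepted as-is.
from camelot.parsers import Lattice

parser = Lattice(backend="poppler")      # named backend, avoids the ghostscript warning

class MyBackend:
    def convert(self, pdf_path, png_path):
        raise NotImplementedError        # plug a real rasterizer in here

parser = Lattice(backend=MyBackend())    # passes the implements_convert() check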
+ shift_text : list + {'l', 'r', 't', 'b'} + Select one or more strings from above and pass them as a + list to specify where the text in a spanning cell should + flow. + + Returns + ------- + indices : list + List of tuples of the form (r_idx, c_idx, text) where + r_idx and c_idx are new row and column indices for text. + + """ + indices = [] + for r_idx, c_idx, text in idx: + for d in shift_text: + if d == "l": + if t.cells[r_idx][c_idx].hspan: + while not t.cells[r_idx][c_idx].left: + c_idx -= 1 + if d == "r": + if t.cells[r_idx][c_idx].hspan: + while not t.cells[r_idx][c_idx].right: + c_idx += 1 + if d == "t": + if t.cells[r_idx][c_idx].vspan: + while not t.cells[r_idx][c_idx].top: + r_idx -= 1 + if d == "b": + if t.cells[r_idx][c_idx].vspan: + while not t.cells[r_idx][c_idx].bottom: + r_idx += 1 + indices.append((r_idx, c_idx, text)) + return indices + + @staticmethod + def _copy_spanning_text(t, copy_text=None): + """Copies over text in empty spanning cells. + + Parameters + ---------- + t : camelot.core.Table + copy_text : list, optional (default: None) + {'h', 'v'} + Select one or more strings from above and pass them as a list + to specify the direction in which text should be copied over + when a cell spans multiple rows or columns. + + Returns + ------- + t : camelot.core.Table + + """ + for f in copy_text: + if f == "h": + for i in range(len(t.cells)): + for j in range(len(t.cells[i])): + if t.cells[i][j].text.strip() == "": + if t.cells[i][j].hspan and not t.cells[i][j].left: + t.cells[i][j].text = t.cells[i][j - 1].text + elif f == "v": + for i in range(len(t.cells)): + for j in range(len(t.cells[i])): + if t.cells[i][j].text.strip() == "": + if t.cells[i][j].vspan and not t.cells[i][j].top: + t.cells[i][j].text = t.cells[i - 1][j].text + return t + + def _generate_table_bbox(self): + def scale_areas(areas): + scaled_areas = [] + for area in areas: + x1, y1, x2, y2 = area.split(",") + x1 = float(x1) + y1 = float(y1) + x2 = float(x2) + y2 = float(y2) + x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers) + scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) + return scaled_areas + + self.image, self.threshold = adaptive_threshold( + self.imagename, + process_background=self.process_background, + blocksize=self.threshold_blocksize, + c=self.threshold_constant, + ) + + image_width = self.image.shape[1] + image_height = self.image.shape[0] + image_width_scaler = image_width / float(self.pdf_width) + image_height_scaler = image_height / float(self.pdf_height) + pdf_width_scaler = self.pdf_width / float(image_width) + pdf_height_scaler = self.pdf_height / float(image_height) + image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) + pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) + + if self.table_areas is None: + regions = None + if self.table_regions is not None: + regions = scale_areas(self.table_regions) + + vertical_mask, vertical_segments = find_lines( + self.threshold, + regions=regions, + direction="vertical", + line_scale=self.line_scale, + iterations=self.iterations, + ) + horizontal_mask, horizontal_segments = find_lines( + self.threshold, + regions=regions, + direction="horizontal", + line_scale=self.line_scale, + iterations=self.iterations, + ) + + self.image = correct_lines( + self.image, + vertical_segments, + horizontal_segments + ) + self.image, threshold = adaptive_threshold_with_img( + self.image, + process_background=self.process_background, + blocksize=self.threshold_blocksize, + c=self.threshold_constant + ) + 
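            # Second detection pass (illustrative note): the rectangles drawn by
            # correct_lines were re-thresholded above, so the find_lines calls
            # below operate on the completed rulings and the resulting contours
            # and joints reflect the corrected table grid.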
+ vertical_mask, vertical_segments = find_lines( + threshold, + regions=regions, + direction="vertical", + line_scale=self.line_scale, + iterations=self.iterations, + ) + horizontal_mask, horizontal_segments = find_lines( + threshold, + regions=regions, + direction="horizontal", + line_scale=self.line_scale, + iterations=self.iterations, + ) + + contours = find_contours(vertical_mask, horizontal_mask) + table_bbox = find_joints(contours, vertical_mask, horizontal_mask) + else: + vertical_mask, vertical_segments = find_lines( + self.threshold, + direction="vertical", + line_scale=self.line_scale, + iterations=self.iterations, + ) + horizontal_mask, horizontal_segments = find_lines( + self.threshold, + direction="horizontal", + line_scale=self.line_scale, + iterations=self.iterations, + ) + + areas = scale_areas(self.table_areas) + table_bbox = find_joints(areas, vertical_mask, horizontal_mask) + + self.table_bbox_unscaled = copy.deepcopy(table_bbox) + + self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image( + table_bbox, vertical_segments, horizontal_segments, pdf_scalers + ) + + def _generate_columns_and_rows(self, table_idx, tk): + # select elements which lie within table_bbox + t_bbox = {} + v_s, h_s = segments_in_bbox( + tk, self.vertical_segments, self.horizontal_segments + ) + t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) + t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text) + + t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) + t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) + + self.t_bbox = t_bbox + + cols, rows = zip(*self.table_bbox[tk]) + cols, rows = list(cols), list(rows) + cols.extend([tk[0], tk[2]]) + rows.extend([tk[1], tk[3]]) + # sort horizontal and vertical segments + cols = merge_close_lines(sorted(cols), line_tol=self.line_tol) + rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol) + # make grid using x and y coord of shortlisted rows and cols + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] + + return cols, rows, v_s, h_s + + def _generate_table(self, table_idx, cols, rows, **kwargs): + v_s = kwargs.get("v_s") + h_s = kwargs.get("h_s") + if v_s is None or h_s is None: + raise ValueError("No segments found on {}".format(self.rootname)) + + table = Table(cols, rows) + # set table edges to True using ver+hor lines + table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol) + # set table border edges to True + table = table.set_border() + # set spanning cells to True + table = table.set_span() + + pos_errors = [] + # TODO: have a single list in place of two directional ones? + # sorted on x-coordinate based on reading order i.e. 
LTR or RTL + for direction in ["vertical", "horizontal"]: + for t in self.t_bbox[direction]: + indices, error = get_table_index( + table, + t, + direction, + split_text=self.split_text, + flag_size=self.flag_size, + strip_text=self.strip_text, + ) + if indices[:2] != (-1, -1): + pos_errors.append(error) + indices = Lattice._reduce_index( + table, indices, shift_text=self.shift_text + ) + for r_idx, c_idx, text in indices: + table.cells[r_idx][c_idx].text = text + accuracy = compute_accuracy([[100, pos_errors]]) + + if self.copy_text is not None: + table = Lattice._copy_spanning_text(table, copy_text=self.copy_text) + + data = table.data + table.df = pd.DataFrame(data) + table.shape = table.df.shape + + whitespace = compute_whitespace(data) + table.flavor = "lattice" + table.accuracy = accuracy + table.whitespace = whitespace + table.order = table_idx + 1 + table.page = int(os.path.basename(self.rootname).replace("page-", "")) + + # for plotting + _text = [] + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) + table._text = _text + table._image = (self.image, self.table_bbox_unscaled) + table._segments = (self.vertical_segments, self.horizontal_segments) + table._textedges = None + + return table + + def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): + self._generate_layout(filename, layout_kwargs) + if not suppress_stdout: + logger.info("Processing {}".format(os.path.basename(self.rootname))) + + if not self.horizontal_text: + if self.images: + warnings.warn( + "{} is image-based, camelot only works on" + " text-based pages.".format(os.path.basename(self.rootname)) + ) + else: + warnings.warn( + "No tables found on {}".format(os.path.basename(self.rootname)) + ) + return [] + + self.backend.convert(self.filename, self.imagename) + + self._generate_table_bbox() + + _tables = [] + # sort tables based on y-coord + for table_idx, tk in enumerate( + sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) + ): + cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk) + table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) + table._bbox = tk + _tables.append(table) + + return _tables diff --git a/src/main/python/camelot/parsers/stream.py b/src/main/python/camelot/parsers/stream.py new file mode 100644 index 00000000..c7b21daf --- /dev/null +++ b/src/main/python/camelot/parsers/stream.py @@ -0,0 +1,468 @@ +# -*- coding: utf-8 -*- + +import os +import logging +import warnings + +import numpy as np +import pandas as pd + +from .base import BaseParser +from ..core import TextEdges, Table +from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace + + +logger = logging.getLogger("camelot") + + +class Stream(BaseParser): + """Stream method of parsing looks for spaces between text + to parse the table. + + If you want to specify columns when specifying multiple table + areas, make sure that the length of both lists are equal. + + Parameters + ---------- + table_regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. + table_areas : list, optional (default: None) + List of table area strings of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. 
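# Illustrative sketch of the spanning-cell options consumed by _reduce_index and
# _copy_spanning_text above (the input file is a placeholder):
from camelot.io import read_pdf

tables = read_pdf("sample.pdf", flavor="lattice",
                  shift_text=["r", "b"],   # move text to the right/bottom cell of a span
                  copy_text=["v"])         # then copy it downward into empty spanned cells
print(tables[0].df.head())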
+ columns : list, optional (default: None) + List of column x-coordinates strings where the coordinates + are comma-separated. + split_text : bool, optional (default: False) + Split text that spans across multiple cells. + flag_size : bool, optional (default: False) + Flag text based on font size. Useful to detect + super/subscripts. Adds around flagged text. + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + edge_tol : int, optional (default: 50) + Tolerance parameter for extending textedges vertically. + row_tol : int, optional (default: 2) + Tolerance parameter used to combine text vertically, + to generate rows. + column_tol : int, optional (default: 0) + Tolerance parameter used to combine text horizontally, + to generate columns. + + """ + + def __init__( + self, + table_regions=None, + table_areas=None, + columns=None, + split_text=False, + flag_size=False, + strip_text="", + edge_tol=50, + row_tol=2, + column_tol=0, + **kwargs, + ): + self.table_regions = table_regions + self.table_areas = table_areas + self.columns = columns + self._validate_columns() + self.split_text = split_text + self.flag_size = flag_size + self.strip_text = strip_text + self.edge_tol = edge_tol + self.row_tol = row_tol + self.column_tol = column_tol + + @staticmethod + def _text_bbox(t_bbox): + """Returns bounding box for the text present on a page. + + Parameters + ---------- + t_bbox : dict + Dict with two keys 'horizontal' and 'vertical' with lists of + LTTextLineHorizontals and LTTextLineVerticals respectively. + + Returns + ------- + text_bbox : tuple + Tuple (x0, y0, x1, y1) in pdf coordinate space. + + """ + xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) + ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]]) + xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]]) + ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]]) + text_bbox = (xmin, ymin, xmax, ymax) + return text_bbox + + @staticmethod + def _group_rows(text, row_tol=2): + """Groups PDFMiner text objects into rows vertically + within a tolerance. + + Parameters + ---------- + text : list + List of PDFMiner text objects. + row_tol : int, optional (default: 2) + + Returns + ------- + rows : list + Two-dimensional list of text objects grouped into rows. + + """ + row_y = 0 + rows = [] + temp = [] + + for t in text: + # is checking for upright necessary? + # if t.get_text().strip() and all([obj.upright for obj in t._objs if + # type(obj) is LTChar]): + if t.get_text().strip(): + if not np.isclose(row_y, t.y0, atol=row_tol): + rows.append(sorted(temp, key=lambda t: t.x0)) + temp = [] + row_y = t.y0 + temp.append(t) + + rows.append(sorted(temp, key=lambda t: t.x0)) + if len(rows) > 1: + __ = rows.pop(0) # TODO: hacky + return rows + + @staticmethod + def _merge_columns(l, column_tol=0): + """Merges column boundaries horizontally if they overlap + or lie within a tolerance. + + Parameters + ---------- + l : list + List of column x-coordinate tuples. + column_tol : int, optional (default: 0) + + Returns + ------- + merged : list + List of merged column x-coordinate tuples. 
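# Self-contained illustration of the column-merging rule documented above; the
# tuples stand for x-extents of detected words on one row.
from camelot.parsers import Stream

spans = sorted([(10, 40), (42, 80), (200, 240)])
print(Stream._merge_columns(spans, column_tol=3))
# -> [(10, 80), (200, 240)] : the first two spans sit within the 3 pt tolerance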
+ + """ + merged = [] + for higher in l: + if not merged: + merged.append(higher) + else: + lower = merged[-1] + if column_tol >= 0: + if higher[0] <= lower[1] or np.isclose( + higher[0], lower[1], atol=column_tol + ): + upper_bound = max(lower[1], higher[1]) + lower_bound = min(lower[0], higher[0]) + merged[-1] = (lower_bound, upper_bound) + else: + merged.append(higher) + elif column_tol < 0: + if higher[0] <= lower[1]: + if np.isclose(higher[0], lower[1], atol=abs(column_tol)): + merged.append(higher) + else: + upper_bound = max(lower[1], higher[1]) + lower_bound = min(lower[0], higher[0]) + merged[-1] = (lower_bound, upper_bound) + else: + merged.append(higher) + return merged + + @staticmethod + def _join_rows(rows_grouped, text_y_max, text_y_min): + """Makes row coordinates continuous. + + Parameters + ---------- + rows_grouped : list + Two-dimensional list of text objects grouped into rows. + text_y_max : int + text_y_min : int + + Returns + ------- + rows : list + List of continuous row y-coordinate tuples. + + """ + row_mids = [ + sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0 + for r in rows_grouped + ] + rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] + rows.insert(0, text_y_max) + rows.append(text_y_min) + rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] + return rows + + @staticmethod + def _add_columns(cols, text, row_tol): + """Adds columns to existing list by taking into account + the text that lies outside the current column x-coordinates. + + Parameters + ---------- + cols : list + List of column x-coordinate tuples. + text : list + List of PDFMiner text objects. + ytol : int + + Returns + ------- + cols : list + Updated list of column x-coordinate tuples. + + """ + if text: + text = Stream._group_rows(text, row_tol=row_tol) + elements = [len(r) for r in text] + new_cols = [ + (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r + ] + cols.extend(Stream._merge_columns(sorted(new_cols))) + return cols + + @staticmethod + def _join_columns(cols, text_x_min, text_x_max): + """Makes column coordinates continuous. + + Parameters + ---------- + cols : list + List of column x-coordinate tuples. + text_x_min : int + text_y_max : int + + Returns + ------- + cols : list + Updated list of column x-coordinate tuples. + + """ + cols = sorted(cols) + cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] + cols.insert(0, text_x_min) + cols.append(text_x_max) + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + return cols + + def _validate_columns(self): + if self.table_areas is not None and self.columns is not None: + if len(self.table_areas) != len(self.columns): + raise ValueError("Length of table_areas and columns" " should be equal") + + def _nurminen_table_detection(self, textlines): + """A general implementation of the table detection algorithm + described by Anssi Nurminen's master's thesis. + Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 + + Assumes that tables are situated relatively far apart + vertically. 
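# Sketch of the explicit-columns path guarded by _validate_columns above; the
# area and column coordinates are made-up PDF-space values.
from camelot.io import read_pdf

tables = read_pdf(
    "sample.pdf",
    flavor="stream",
    table_areas=["72,700,540,100"],    # x1,y1,x2,y2 with left-top / right-bottom
    columns=["120,210,330,420"],       # one comma-separated string per table area
)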
+ """ + # TODO: add support for arabic text #141 + # sort textlines in reading order + textlines.sort(key=lambda x: (-x.y0, x.x0)) + textedges = TextEdges(edge_tol=self.edge_tol) + # generate left, middle and right textedges + textedges.generate(textlines) + # select relevant edges + relevant_textedges = textedges.get_relevant() + self.textedges.extend(relevant_textedges) + # guess table areas using textlines and relevant edges + table_bbox = textedges.get_table_areas(textlines, relevant_textedges) + # treat whole page as table area if no table areas found + if not len(table_bbox): + table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} + + return table_bbox + + def _generate_table_bbox(self): + self.textedges = [] + if self.table_areas is None: + hor_text = self.horizontal_text + if self.table_regions is not None: + # filter horizontal text + hor_text = [] + for region in self.table_regions: + x1, y1, x2, y2 = region.split(",") + x1 = float(x1) + y1 = float(y1) + x2 = float(x2) + y2 = float(y2) + region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text) + hor_text.extend(region_text) + # find tables based on nurminen's detection algorithm + table_bbox = self._nurminen_table_detection(hor_text) + else: + table_bbox = {} + for area in self.table_areas: + x1, y1, x2, y2 = area.split(",") + x1 = float(x1) + y1 = float(y1) + x2 = float(x2) + y2 = float(y2) + table_bbox[(x1, y2, x2, y1)] = None + self.table_bbox = table_bbox + + def _generate_columns_and_rows(self, table_idx, tk): + # select elements which lie within table_bbox + t_bbox = {} + t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) + t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text) + + t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) + t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) + + self.t_bbox = t_bbox + + text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) + rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol) + rows = self._join_rows(rows_grouped, text_y_max, text_y_min) + elements = [len(r) for r in rows_grouped] + + if self.columns is not None and self.columns[table_idx] != "": + # user has to input boundary columns too + # take (0, pdf_width) by default + # similar to else condition + # len can't be 1 + cols = self.columns[table_idx].split(",") + cols = [float(c) for c in cols] + cols.insert(0, text_x_min) + cols.append(text_x_max) + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + else: + # calculate mode of the list of number of elements in + # each row to guess the number of columns + if not len(elements): + cols = [(text_x_min, text_x_max)] + else: + ncols = max(set(elements), key=elements.count) + if ncols == 1: + # if mode is 1, the page usually contains not tables + # but there can be cases where the list can be skewed, + # try to remove all 1s from list in this case and + # see if the list contains elements, if yes, then use + # the mode after removing 1s + elements = list(filter(lambda x: x != 1, elements)) + if len(elements): + ncols = max(set(elements), key=elements.count) + else: + warnings.warn(f"No tables found in table area {table_idx + 1}") + cols = [ + (t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r + ] + cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) + inner_text = [] + for i in range(1, len(cols)): + left = cols[i - 1][1] + right = cols[i][0] + inner_text.extend( + [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > 
left and t.x1 < right + ] + ) + outer_text = [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > cols[-1][1] or t.x1 < cols[0][0] + ] + inner_text.extend(outer_text) + cols = self._add_columns(cols, inner_text, self.row_tol) + cols = self._join_columns(cols, text_x_min, text_x_max) + + return cols, rows + + def _generate_table(self, table_idx, cols, rows, **kwargs): + table = Table(cols, rows) + table = table.set_all_edges() + + pos_errors = [] + # TODO: have a single list in place of two directional ones? + # sorted on x-coordinate based on reading order i.e. LTR or RTL + for direction in ["vertical", "horizontal"]: + for t in self.t_bbox[direction]: + indices, error = get_table_index( + table, + t, + direction, + split_text=self.split_text, + flag_size=self.flag_size, + strip_text=self.strip_text, + ) + if indices[:2] != (-1, -1): + pos_errors.append(error) + for r_idx, c_idx, text in indices: + table.cells[r_idx][c_idx].text = text + accuracy = compute_accuracy([[100, pos_errors]]) + + data = table.data + table.df = pd.DataFrame(data) + table.shape = table.df.shape + + whitespace = compute_whitespace(data) + table.flavor = "stream" + table.accuracy = accuracy + table.whitespace = whitespace + table.order = table_idx + 1 + table.page = int(os.path.basename(self.rootname).replace("page-", "")) + + # for plotting + _text = [] + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) + table._text = _text + table._image = None + table._segments = None + table._textedges = self.textedges + + return table + + def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): + self._generate_layout(filename, layout_kwargs) + base_filename = os.path.basename(self.rootname) + + if not suppress_stdout: + logger.info(f"Processing {base_filename}") + + if not self.horizontal_text: + if self.images: + warnings.warn( + f"{base_filename} is image-based, camelot only works on" + " text-based pages." + ) + else: + warnings.warn(f"No tables found on {base_filename}") + return [] + + self._generate_table_bbox() + + _tables = [] + # sort tables based on y-coord + for table_idx, tk in enumerate( + sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) + ): + cols, rows = self._generate_columns_and_rows(table_idx, tk) + table = self._generate_table(table_idx, cols, rows) + table._bbox = tk + _tables.append(table) + + return _tables diff --git a/src/main/python/camelot/plotting.py b/src/main/python/camelot/plotting.py new file mode 100644 index 00000000..f5b6afe9 --- /dev/null +++ b/src/main/python/camelot/plotting.py @@ -0,0 +1,225 @@ +# -*- coding: utf-8 -*- + +try: + import matplotlib.pyplot as plt + import matplotlib.patches as patches +except ImportError: + _HAS_MPL = False +else: + _HAS_MPL = True + + +class PlotMethods(object): + def __call__(self, table, kind="text", filename=None): + """Plot elements found on PDF page based on kind + specified, useful for debugging and playing with different + parameters to get the best output. + + Parameters + ---------- + table: camelot.core.Table + A Camelot Table. + kind : str, optional (default: 'text') + {'text', 'grid', 'contour', 'joint', 'line'} + The element type for which a plot should be generated. + filepath: str, optional (default: None) + Absolute path for saving the generated plot. 
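# Hedged usage sketch for the PlotMethods dispatcher above (matplotlib required;
# upstream camelot exposes an instance of this class as camelot.plot):
from camelot.io import read_pdf
from camelot.plotting import PlotMethods

plot = PlotMethods()
tables = read_pdf("sample.pdf", flavor="lattice")
fig = plot(tables[0], kind="grid")                        # returns a matplotlib Figure
plot(tables[0], kind="contour", filename="contour.png")   # saves and returns None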
+ + Returns + ------- + fig : matplotlib.fig.Figure + + """ + if not _HAS_MPL: + raise ImportError("matplotlib is required for plotting.") + + if table.flavor == "lattice" and kind in ["textedge"]: + raise NotImplementedError(f"Lattice flavor does not support kind='{kind}'") + elif table.flavor == "stream" and kind in ["joint", "line"]: + raise NotImplementedError(f"Stream flavor does not support kind='{kind}'") + + plot_method = getattr(self, kind) + fig = plot_method(table) + + if filename is not None: + fig.savefig(filename) + return None + + return fig + + def text(self, table): + """Generates a plot for all text elements present + on the PDF page. + + Parameters + ---------- + table : camelot.core.Table + + Returns + ------- + fig : matplotlib.fig.Figure + + """ + fig = plt.figure() + ax = fig.add_subplot(111, aspect="equal") + xs, ys = [], [] + for t in table._text: + xs.extend([t[0], t[2]]) + ys.extend([t[1], t[3]]) + ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1])) + ax.set_xlim(min(xs) - 10, max(xs) + 10) + ax.set_ylim(min(ys) - 10, max(ys) + 10) + return fig + + def grid(self, table): + """Generates a plot for the detected table grids + on the PDF page. + + Parameters + ---------- + table : camelot.core.Table + + Returns + ------- + fig : matplotlib.fig.Figure + + """ + fig = plt.figure() + ax = fig.add_subplot(111, aspect="equal") + for row in table.cells: + for cell in row: + if cell.left: + ax.plot([cell.lb[0], cell.lt[0]], [cell.lb[1], cell.lt[1]]) + if cell.right: + ax.plot([cell.rb[0], cell.rt[0]], [cell.rb[1], cell.rt[1]]) + if cell.top: + ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]]) + if cell.bottom: + ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]]) + return fig + + def contour(self, table): + """Generates a plot for all table boundaries present + on the PDF page. + + Parameters + ---------- + table : camelot.core.Table + + Returns + ------- + fig : matplotlib.fig.Figure + + """ + try: + img, table_bbox = table._image + _FOR_LATTICE = True + except TypeError: + img, table_bbox = (None, {table._bbox: None}) + _FOR_LATTICE = False + fig = plt.figure() + ax = fig.add_subplot(111, aspect="equal") + + xs, ys = [], [] + if not _FOR_LATTICE: + for t in table._text: + xs.extend([t[0], t[2]]) + ys.extend([t[1], t[3]]) + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue" + ) + ) + + for t in table_bbox.keys(): + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red" + ) + ) + if not _FOR_LATTICE: + xs.extend([t[0], t[2]]) + ys.extend([t[1], t[3]]) + ax.set_xlim(min(xs) - 10, max(xs) + 10) + ax.set_ylim(min(ys) - 10, max(ys) + 10) + + if _FOR_LATTICE: + ax.imshow(img) + return fig + + def textedge(self, table): + """Generates a plot for relevant textedges. + + Parameters + ---------- + table : camelot.core.Table + + Returns + ------- + fig : matplotlib.fig.Figure + + """ + fig = plt.figure() + ax = fig.add_subplot(111, aspect="equal") + xs, ys = [], [] + for t in table._text: + xs.extend([t[0], t[2]]) + ys.extend([t[1], t[3]]) + ax.add_patch( + patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue") + ) + ax.set_xlim(min(xs) - 10, max(xs) + 10) + ax.set_ylim(min(ys) - 10, max(ys) + 10) + + for te in table._textedges: + ax.plot([te.x, te.x], [te.y0, te.y1]) + + return fig + + def joint(self, table): + """Generates a plot for all line intersections present + on the PDF page. 
+ + Parameters + ---------- + table : camelot.core.Table + + Returns + ------- + fig : matplotlib.fig.Figure + + """ + img, table_bbox = table._image + fig = plt.figure() + ax = fig.add_subplot(111, aspect="equal") + x_coord = [] + y_coord = [] + for k in table_bbox.keys(): + for coord in table_bbox[k]: + x_coord.append(coord[0]) + y_coord.append(coord[1]) + ax.plot(x_coord, y_coord, "ro") + ax.imshow(img) + return fig + + def line(self, table): + """Generates a plot for all line segments present + on the PDF page. + + Parameters + ---------- + table : camelot.core.Table + + Returns + ------- + fig : matplotlib.fig.Figure + + """ + fig = plt.figure() + ax = fig.add_subplot(111, aspect="equal") + vertical, horizontal = table._segments + for v in vertical: + ax.plot([v[0], v[2]], [v[1], v[3]]) + for h in horizontal: + ax.plot([h[0], h[2]], [h[1], h[3]]) + return fig diff --git a/src/main/python/camelot/utils.py b/src/main/python/camelot/utils.py new file mode 100644 index 00000000..404c00b2 --- /dev/null +++ b/src/main/python/camelot/utils.py @@ -0,0 +1,938 @@ +# -*- coding: utf-8 -*- + +import os +import re +import random +import shutil +import string +import tempfile +import warnings +from itertools import groupby +from operator import itemgetter + +import numpy as np +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfpage import PDFTextExtractionNotAllowed +from pdfminer.pdfinterp import PDFResourceManager +from pdfminer.pdfinterp import PDFPageInterpreter +from pdfminer.converter import PDFPageAggregator +from pdfminer.layout import ( + LAParams, + LTAnno, + LTChar, + LTTextLineHorizontal, + LTTextLineVertical, + LTImage, +) + +from urllib.request import Request, urlopen +from urllib.parse import urlparse as parse_url +from urllib.parse import uses_relative, uses_netloc, uses_params + + +_VALID_URLS = set(uses_relative + uses_netloc + uses_params) +_VALID_URLS.discard("") + + +# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py +def is_url(url): + """Check to see if a URL has a valid protocol. + + Parameters + ---------- + url : str or unicode + + Returns + ------- + isurl : bool + If url has a valid protocol return True otherwise False. + + """ + try: + return parse_url(url).scheme in _VALID_URLS + except Exception: + return False + + +def random_string(length): + ret = "" + while length: + ret += random.choice( + string.digits + string.ascii_lowercase + string.ascii_uppercase + ) + length -= 1 + return ret + + +def download_url(url): + """Download file from specified URL. + + Parameters + ---------- + url : str or unicode + + Returns + ------- + filepath : str or unicode + Temporary filepath. 
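# Small sketch of the URL helpers above; the URL is a placeholder, and
# download_url rejects anything whose content type is not application/pdf.
from camelot.utils import is_url, download_url

url = "https://example.com/report.pdf"
if is_url(url):
    local_path = download_url(url)   # random <6 chars>.pdf under the temp directory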
+ + """ + filename = f"{random_string(6)}.pdf" + with tempfile.NamedTemporaryFile("wb", delete=False) as f: + headers = {"User-Agent": "Mozilla/5.0"} + request = Request(url, None, headers) + obj = urlopen(request) + content_type = obj.info().get_content_type() + if content_type != "application/pdf": + raise NotImplementedError("File format not supported") + f.write(obj.read()) + filepath = os.path.join(os.path.dirname(f.name), filename) + shutil.move(f.name, filepath) + return filepath + + +stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] +lattice_kwargs = [ + "process_background", + "line_scale", + "copy_text", + "shift_text", + "line_tol", + "joint_tol", + "threshold_blocksize", + "threshold_constant", + "iterations", + "resolution", +] + + +def validate_input(kwargs, flavor="lattice"): + def check_intersection(parser_kwargs, input_kwargs): + isec = set(parser_kwargs).intersection(set(input_kwargs.keys())) + if isec: + raise ValueError( + f"{','.join(sorted(isec))} cannot be used with flavor='{flavor}'" + ) + + if flavor == "lattice": + check_intersection(stream_kwargs, kwargs) + else: + check_intersection(lattice_kwargs, kwargs) + + +def remove_extra(kwargs, flavor="lattice"): + if flavor == "lattice": + for key in kwargs.keys(): + if key in stream_kwargs: + kwargs.pop(key) + else: + for key in kwargs.keys(): + if key in lattice_kwargs: + kwargs.pop(key) + return kwargs + + +# https://stackoverflow.com/a/22726782 +class TemporaryDirectory(object): + def __enter__(self): + self.name = tempfile.mkdtemp() + return self.name + + def __exit__(self, exc_type, exc_value, traceback): + shutil.rmtree(self.name) + + +def translate(x1, x2): + """Translates x2 by x1. + + Parameters + ---------- + x1 : float + x2 : float + + Returns + ------- + x2 : float + + """ + x2 += x1 + return x2 + + +def scale(x, s): + """Scales x by scaling factor s. + + Parameters + ---------- + x : float + s : float + + Returns + ------- + x : float + + """ + x *= s + return x + + +def scale_pdf(k, factors): + """Translates and scales pdf coordinate space to image + coordinate space. + + Parameters + ---------- + k : tuple + Tuple (x1, y1, x2, y2) representing table bounding box where + (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate + space. + factors : tuple + Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the + first two elements are scaling factors and pdf_y is height of + pdf. + + Returns + ------- + knew : tuple + Tuple (x1, y1, x2, y2) representing table bounding box where + (x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate + space. + + """ + x1, y1, x2, y2 = k + scaling_factor_x, scaling_factor_y, pdf_y = factors + x1 = scale(x1, scaling_factor_x) + y1 = scale(abs(translate(-pdf_y, y1)), scaling_factor_y) + x2 = scale(x2, scaling_factor_x) + y2 = scale(abs(translate(-pdf_y, y2)), scaling_factor_y) + knew = (int(x1), int(y1), int(x2), int(y2)) + return knew + + +def scale_image(tables, v_segments, h_segments, factors): + """Translates and scales image coordinate space to pdf + coordinate space. + + Parameters + ---------- + tables : dict + Dict with table boundaries as keys and list of intersections + in that boundary as value. + v_segments : list + List of vertical line segments. + h_segments : list + List of horizontal line segments. + factors : tuple + Tuple (scaling_factor_x, scaling_factor_y, img_y) where the + first two elements are scaling factors and img_y is height of + image. 
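# Worked example for scale_pdf above: PDF coordinates (origin at the bottom-left)
# are flipped about the page height and scaled into image coordinates. The
# factors are made up and assume the page was rendered at 2x in both axes.
from camelot.utils import scale_pdf

pdf_height = 842                        # A4 height in points
factors = (2.0, 2.0, pdf_height)
print(scale_pdf((72, 770, 540, 100), factors))   # -> (144, 144, 1080, 1484)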
+ + Returns + ------- + tables_new : dict + v_segments_new : dict + h_segments_new : dict + + """ + scaling_factor_x, scaling_factor_y, img_y = factors + tables_new = {} + for k in tables.keys(): + x1, y1, x2, y2 = k + x1 = scale(x1, scaling_factor_x) + y1 = scale(abs(translate(-img_y, y1)), scaling_factor_y) + x2 = scale(x2, scaling_factor_x) + y2 = scale(abs(translate(-img_y, y2)), scaling_factor_y) + j_x, j_y = zip(*tables[k]) + j_x = [scale(j, scaling_factor_x) for j in j_x] + j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y] + joints = zip(j_x, j_y) + tables_new[(x1, y1, x2, y2)] = joints + + v_segments_new = [] + for v in v_segments: + x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x) + y1, y2 = ( + scale(abs(translate(-img_y, v[1])), scaling_factor_y), + scale(abs(translate(-img_y, v[3])), scaling_factor_y), + ) + v_segments_new.append((x1, y1, x2, y2)) + + h_segments_new = [] + for h in h_segments: + x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x) + y1, y2 = ( + scale(abs(translate(-img_y, h[1])), scaling_factor_y), + scale(abs(translate(-img_y, h[3])), scaling_factor_y), + ) + h_segments_new.append((x1, y1, x2, y2)) + + return tables_new, v_segments_new, h_segments_new + + +def get_rotation(chars, horizontal_text, vertical_text): + """Detects if text in table is rotated or not using the current + transformation matrix (CTM) and returns its orientation. + + Parameters + ---------- + horizontal_text : list + List of PDFMiner LTTextLineHorizontal objects. + vertical_text : list + List of PDFMiner LTTextLineVertical objects. + ltchar : list + List of PDFMiner LTChar objects. + + Returns + ------- + rotation : string + '' if text in table is upright, 'anticlockwise' if + rotated 90 degree anticlockwise and 'clockwise' if + rotated 90 degree clockwise. + + """ + rotation = "" + hlen = len([t for t in horizontal_text if t.get_text().strip()]) + vlen = len([t for t in vertical_text if t.get_text().strip()]) + if hlen < vlen: + clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars) + anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars) + rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise" + return rotation + + +def segments_in_bbox(bbox, v_segments, h_segments): + """Returns all line segments present inside a bounding box. + + Parameters + ---------- + bbox : tuple + Tuple (x1, y1, x2, y2) representing a bounding box where + (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate + space. + v_segments : list + List of vertical line segments. + h_segments : list + List of vertical horizontal segments. + + Returns + ------- + v_s : list + List of vertical line segments that lie inside table. + h_s : list + List of horizontal line segments that lie inside table. + + """ + lb = (bbox[0], bbox[1]) + rt = (bbox[2], bbox[3]) + v_s = [ + v + for v in v_segments + if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2 + ] + h_s = [ + h + for h in h_segments + if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2 + ] + return v_s, h_s + + +def text_in_bbox(bbox, text): + """Returns all text objects present inside a bounding box. + + Parameters + ---------- + bbox : tuple + Tuple (x1, y1, x2, y2) representing a bounding box where + (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate + space. + text : List of PDFMiner text objects. 
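# Illustrative call to segments_in_bbox() above, using hand-written segments
# rather than ones detected from an image; the 2-point tolerance in the
# filter keeps segments that sit exactly on the table border.
bbox = (100, 100, 300, 200)                      # (x1, y1, x2, y2), lb and rt
vertical = [(100, 100, 100, 200),                # left border, kept
            (500, 100, 500, 200)]                # outside the bbox, dropped
horizontal = [(100, 200, 300, 200),              # top border, kept
              (100, 400, 300, 400)]              # outside, dropped
v_s, h_s = segments_in_bbox(bbox, vertical, horizontal)
# v_s == [(100, 100, 100, 200)], h_s == [(100, 200, 300, 200)]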
+ + Returns + ------- + t_bbox : list + List of PDFMiner text objects that lie inside table, discarding the overlapping ones + + """ + lb = (bbox[0], bbox[1]) + rt = (bbox[2], bbox[3]) + t_bbox = [ + t + for t in text + if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2 + and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2 + ] + + # Avoid duplicate text by discarding overlapping boxes + rest = {t for t in t_bbox} + for ba in t_bbox: + for bb in rest.copy(): + if ba == bb: + continue + if bbox_intersect(ba, bb): + # if the intersection is larger than 80% of ba's size, we keep the longest + if (bbox_intersection_area(ba, bb) / bbox_area(ba)) > 0.8: + if bbox_longer(bb, ba): + rest.discard(ba) + unique_boxes = list(rest) + + return unique_boxes + + +def bbox_intersection_area(ba, bb) -> float: + """Returns area of the intersection of the bounding boxes of two PDFMiner objects. + + Parameters + ---------- + ba : PDFMiner text object + bb : PDFMiner text object + + Returns + ------- + intersection_area : float + Area of the intersection of the bounding boxes of both objects + + """ + x_left = max(ba.x0, bb.x0) + y_top = min(ba.y1, bb.y1) + x_right = min(ba.x1, bb.x1) + y_bottom = max(ba.y0, bb.y0) + + if x_right < x_left or y_bottom > y_top: + return 0.0 + + intersection_area = (x_right - x_left) * (y_top - y_bottom) + return intersection_area + + +def bbox_area(bb) -> float: + """Returns area of the bounding box of a PDFMiner object. + + Parameters + ---------- + bb : PDFMiner text object + + Returns + ------- + area : float + Area of the bounding box of the object + + """ + return (bb.x1 - bb.x0) * (bb.y1 - bb.y0) + + +def bbox_intersect(ba, bb) -> bool: + """Returns True if the bounding boxes of two PDFMiner objects intersect. + + Parameters + ---------- + ba : PDFMiner text object + bb : PDFMiner text object + + Returns + ------- + overlaps : bool + True if the bounding boxes intersect + + """ + return ba.x1 >= bb.x0 and bb.x1 >= ba.x0 and ba.y1 >= bb.y0 and bb.y1 >= ba.y0 + + +def bbox_longer(ba, bb) -> bool: + """Returns True if the bounding box of the first PDFMiner object is longer or equal to the second. + + Parameters + ---------- + ba : PDFMiner text object + bb : PDFMiner text object + + Returns + ------- + longer : bool + True if the bounding box of the first object is longer or equal + + """ + return (ba.x1 - ba.x0) >= (bb.x1 - bb.x0) + + +def merge_close_lines(ar, line_tol=2): + """Merges lines which are within a tolerance by calculating a + moving mean, based on their x or y axis projections. + + Parameters + ---------- + ar : list + line_tol : int, optional (default: 2) + + Returns + ------- + ret : list + + """ + ret = [] + for a in ar: + if not ret: + ret.append(a) + else: + temp = ret[-1] + if np.isclose(temp, a, atol=line_tol): + temp = (temp + a) / 2.0 + ret[-1] = temp + else: + ret.append(a) + return ret + + +def text_strip(text, strip=""): + """Strips any characters in `strip` that are present in `text`. + Parameters + ---------- + text : str + Text to process and strip. + strip : str, optional (default: '') + Characters that should be stripped from `text`. 
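# The overlap helpers above only need objects with x0/y0/x1/y1 attributes,
# so a tiny stand-in (an assumption made purely for this sketch; real callers
# pass PDFMiner text objects) is enough to exercise the geometry that
# text_in_bbox() uses when it discards duplicates.
from collections import namedtuple

Box = namedtuple("Box", ["x0", "y0", "x1", "y1"])
ba, bb = Box(0, 0, 10, 10), Box(8, 0, 20, 10)

bbox_intersect(ba, bb)                            # True, they share 8 <= x <= 10
bbox_intersection_area(ba, bb)                    # 20.0 (2 wide x 10 high)
bbox_intersection_area(ba, bb) / bbox_area(ba)    # 0.2, below the 0.8 cutoff
bbox_longer(bb, ba)                               # True, 12 units wide vs 10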
+ Returns + ------- + stripped : str + """ + if not strip: + return text + + stripped = re.sub( + fr"[{''.join(map(re.escape, strip))}]", "", text, flags=re.UNICODE + ) + return stripped + + +# TODO: combine the following functions into a TextProcessor class which +# applies corresponding transformations sequentially +# (inspired from sklearn.pipeline.Pipeline) + + +def flag_font_size(textline, direction, strip_text=""): + """Flags super/subscripts in text by enclosing them with . + May give false positives. + + Parameters + ---------- + textline : list + List of PDFMiner LTChar objects. + direction : string + Direction of the PDFMiner LTTextLine object. + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + + Returns + ------- + fstring : string + + """ + if direction == "horizontal": + d = [ + (t.get_text(), np.round(t.height, decimals=6)) + for t in textline + if not isinstance(t, LTAnno) + ] + elif direction == "vertical": + d = [ + (t.get_text(), np.round(t.width, decimals=6)) + for t in textline + if not isinstance(t, LTAnno) + ] + l = [np.round(size, decimals=6) for text, size in d] + if len(set(l)) > 1: + flist = [] + min_size = min(l) + for key, chars in groupby(d, itemgetter(1)): + if key == min_size: + fchars = [t[0] for t in chars] + if "".join(fchars).strip(): + fchars.insert(0, "") + fchars.append("") + flist.append("".join(fchars)) + else: + fchars = [t[0] for t in chars] + if "".join(fchars).strip(): + flist.append("".join(fchars)) + fstring = "".join(flist) + else: + fstring = "".join([t.get_text() for t in textline]) + return text_strip(fstring, strip_text) + + +def split_textline(table, textline, direction, flag_size=False, strip_text=""): + """Splits PDFMiner LTTextLine into substrings if it spans across + multiple rows/columns. + + Parameters + ---------- + table : camelot.core.Table + textline : object + PDFMiner LTTextLine object. + direction : string + Direction of the PDFMiner LTTextLine object. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using + if its size is different from rest of the string. (Useful for + super and subscripts.) + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + + Returns + ------- + grouped_chars : list + List of tuples of the form (idx, text) where idx is the index + of row/column and text is the an lttextline substring. 
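# text_strip() above removes every character listed in `strip` from a cell's
# text; a couple of throwaway strings show the effect.
text_strip("1,234.56\n", strip=",\n")   # -> '1234.56'
text_strip("94.2 %", strip=" %")        # -> '94.2'
text_strip("unchanged")                 # -> 'unchanged' (empty strip is a no-op)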
+ + """ + idx = 0 + cut_text = [] + bbox = textline.bbox + try: + if direction == "horizontal" and not textline.is_empty(): + x_overlap = [ + i + for i, x in enumerate(table.cols) + if x[0] <= bbox[2] and bbox[0] <= x[1] + ] + r_idx = [ + j + for j, r in enumerate(table.rows) + if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0] + ] + r = r_idx[0] + x_cuts = [ + (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right + ] + if not x_cuts: + x_cuts = [(x_overlap[0], table.cells[r][-1].x2)] + for obj in textline._objs: + row = table.rows[r] + for cut in x_cuts: + if isinstance(obj, LTChar): + if ( + row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] + and (obj.x0 + obj.x1) / 2 <= cut[1] + ): + cut_text.append((r, cut[0], obj)) + break + else: + # TODO: add test + if cut == x_cuts[-1]: + cut_text.append((r, cut[0] + 1, obj)) + elif isinstance(obj, LTAnno): + cut_text.append((r, cut[0], obj)) + elif direction == "vertical" and not textline.is_empty(): + y_overlap = [ + j + for j, y in enumerate(table.rows) + if y[1] <= bbox[3] and bbox[1] <= y[0] + ] + c_idx = [ + i + for i, c in enumerate(table.cols) + if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1] + ] + c = c_idx[0] + y_cuts = [ + (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom + ] + if not y_cuts: + y_cuts = [(y_overlap[0], table.cells[-1][c].y1)] + for obj in textline._objs: + col = table.cols[c] + for cut in y_cuts: + if isinstance(obj, LTChar): + if ( + col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] + and (obj.y0 + obj.y1) / 2 >= cut[1] + ): + cut_text.append((cut[0], c, obj)) + break + else: + # TODO: add test + if cut == y_cuts[-1]: + cut_text.append((cut[0] - 1, c, obj)) + elif isinstance(obj, LTAnno): + cut_text.append((cut[0], c, obj)) + except IndexError: + return [(-1, -1, textline.get_text())] + grouped_chars = [] + for key, chars in groupby(cut_text, itemgetter(0, 1)): + if flag_size: + grouped_chars.append( + ( + key[0], + key[1], + flag_font_size( + [t[2] for t in chars], direction, strip_text=strip_text + ), + ) + ) + else: + gchars = [t[2].get_text() for t in chars] + grouped_chars.append( + (key[0], key[1], text_strip("".join(gchars), strip_text)) + ) + return grouped_chars + + +def get_table_index( + table, t, direction, split_text=False, flag_size=False, strip_text="" +): + """Gets indices of the table cell where given text object lies by + comparing their y and x-coordinates. + + Parameters + ---------- + table : camelot.core.Table + t : object + PDFMiner LTTextLine object. + direction : string + Direction of the PDFMiner LTTextLine object. + split_text : bool, optional (default: False) + Whether or not to split a text line if it spans across + multiple cells. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using + if its size is different from rest of the string. (Useful for + super and subscripts) + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + + Returns + ------- + indices : list + List of tuples of the form (r_idx, c_idx, text) where r_idx + and c_idx are row and column indices. + error : float + Assignment error, percentage of text area that lies outside + a cell. 
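# The row/column lookups in split_textline() above reduce to a midpoint-in-
# interval test: table.rows holds (y_top, y_bottom) pairs and table.cols holds
# (x_left, x_right) pairs, as the comparisons above assume. A toy version with
# made-up coordinates shows how a text line's centre picks its cell indices.
rows = [(700, 650), (650, 600)]    # y decreases down the page
cols = [(50, 150), (150, 250)]
x_mid, y_mid = 180.0, 640.0        # centre of some text line's bbox
r_idx = next(i for i, r in enumerate(rows) if r[1] <= y_mid <= r[0])   # -> 1
c_idx = next(j for j, c in enumerate(cols) if c[0] <= x_mid <= c[1])   # -> 1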
+ +-------+ + | | + | [Text bounding box] + | | + +-------+ + + """ + r_idx, c_idx = [-1] * 2 + for r in range(len(table.rows)): + if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[ + r + ][1]: + lt_col_overlap = [] + for c in table.cols: + if c[0] <= t.x1 and c[1] >= t.x0: + left = t.x0 if c[0] <= t.x0 else c[0] + right = t.x1 if c[1] >= t.x1 else c[1] + lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1])) + else: + lt_col_overlap.append(-1) + if len(list(filter(lambda x: x != -1, lt_col_overlap))) == 0: + text = t.get_text().strip("\n") + text_range = (t.x0, t.x1) + col_range = (table.cols[0][0], table.cols[-1][1]) + warnings.warn( + f"{text} {text_range} does not lie in column range {col_range}" + ) + r_idx = r + c_idx = lt_col_overlap.index(max(lt_col_overlap)) + break + + # error calculation + y0_offset, y1_offset, x0_offset, x1_offset = [0] * 4 + if t.y0 > table.rows[r_idx][0]: + y0_offset = abs(t.y0 - table.rows[r_idx][0]) + if t.y1 < table.rows[r_idx][1]: + y1_offset = abs(t.y1 - table.rows[r_idx][1]) + if t.x0 < table.cols[c_idx][0]: + x0_offset = abs(t.x0 - table.cols[c_idx][0]) + if t.x1 > table.cols[c_idx][1]: + x1_offset = abs(t.x1 - table.cols[c_idx][1]) + X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1) + Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1) + charea = X * Y + error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea + + if split_text: + return ( + split_textline( + table, t, direction, flag_size=flag_size, strip_text=strip_text + ), + error, + ) + else: + if flag_size: + return ( + [ + ( + r_idx, + c_idx, + flag_font_size(t._objs, direction, strip_text=strip_text), + ) + ], + error, + ) + else: + return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error + + +def compute_accuracy(error_weights): + """Calculates a score based on weights assigned to various + parameters and their error percentages. + + Parameters + ---------- + error_weights : list + Two-dimensional list of the form [[p1, e1], [p2, e2], ...] + where pn is the weight assigned to list of errors en. + Sum of pn should be equal to 100. + + Returns + ------- + score : float + + """ + SCORE_VAL = 100 + try: + score = 0 + if sum([ew[0] for ew in error_weights]) != SCORE_VAL: + raise ValueError("Sum of weights should be equal to 100.") + for ew in error_weights: + weight = ew[0] / len(ew[1]) + for error_percentage in ew[1]: + score += weight * (1 - error_percentage) + except ZeroDivisionError: + score = 0 + return score + + +def compute_whitespace(d): + """Calculates the percentage of empty strings in a + two-dimensional list. + + Parameters + ---------- + d : list + + Returns + ------- + whitespace : float + Percentage of empty cells. + + """ + whitespace = 0 + r_nempty_cells, c_nempty_cells = [], [] + for i in d: + for j in i: + if j.strip() == "": + whitespace += 1 + whitespace = 100 * (whitespace / float(len(d) * len(d[0]))) + return whitespace + + +def get_page_layout( + filename, + line_overlap=0.5, + char_margin=1.0, + line_margin=0.5, + word_margin=0.1, + boxes_flow=0.5, + detect_vertical=True, + all_texts=True, +): + """Returns a PDFMiner LTPage object and page dimension of a single + page pdf. To get the definitions of kwargs, see + https://pdfminersix.rtfd.io/en/latest/reference/composable.html. + + Parameters + ---------- + filename : string + Path to pdf file. 
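# The two scoring helpers above work on plain Python lists, so they can be
# sanity-checked without a PDF. In compute_accuracy() the weights must sum to
# 100, and each weight is spread over its list of error percentages.
compute_accuracy([[100, [0.1, 0.3]]])
# -> 50 * (1 - 0.1) + 50 * (1 - 0.3) = 80.0
compute_whitespace([["a", ""], ["", "b"]])
# -> 50.0, two of the four cells are empty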
+ line_overlap : float + char_margin : float + line_margin : float + word_margin : float + boxes_flow : float + detect_vertical : bool + all_texts : bool + + Returns + ------- + layout : object + PDFMiner LTPage object. + dim : tuple + Dimension of pdf page in the form (width, height). + + """ + with open(filename, "rb") as f: + parser = PDFParser(f) + document = PDFDocument(parser) + if not document.is_extractable: + raise PDFTextExtractionNotAllowed( + f"Text extraction is not allowed: {filename}" + ) + laparams = LAParams( + line_overlap=line_overlap, + char_margin=char_margin, + line_margin=line_margin, + word_margin=word_margin, + boxes_flow=boxes_flow, + detect_vertical=detect_vertical, + all_texts=all_texts, + ) + rsrcmgr = PDFResourceManager() + device = PDFPageAggregator(rsrcmgr, laparams=laparams) + interpreter = PDFPageInterpreter(rsrcmgr, device) + for page in PDFPage.create_pages(document): + interpreter.process_page(page) + layout = device.get_result() + width = layout.bbox[2] + height = layout.bbox[3] + dim = (width, height) + return layout, dim + + +def get_text_objects(layout, ltype="char", t=None): + """Recursively parses pdf layout to get a list of + PDFMiner text objects. + + Parameters + ---------- + layout : object + PDFMiner LTPage object. + ltype : string + Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal, + and LTTextLineVertical objects respectively. + t : list + + Returns + ------- + t : list + List of PDFMiner text objects. + + """ + if ltype == "char": + LTObject = LTChar + elif ltype == "image": + LTObject = LTImage + elif ltype == "horizontal_text": + LTObject = LTTextLineHorizontal + elif ltype == "vertical_text": + LTObject = LTTextLineVertical + if t is None: + t = [] + try: + for obj in layout._objs: + if isinstance(obj, LTObject): + t.append(obj) + else: + t += get_text_objects(obj, ltype=ltype) + except AttributeError: + pass + return t diff --git a/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/TestsConfiguration.kt b/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/TestsConfiguration.kt index 3638eb72..4e832cc9 100644 --- a/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/TestsConfiguration.kt +++ b/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/TestsConfiguration.kt @@ -2,6 +2,6 @@ package com.github.darderion.mundaneassignmentpolice class TestsConfiguration { companion object { - const val resourceFolder = "src/test/resources/com/github/darderion/mundaneassignmentpolice/" + const val resourceFolder = "src/main/python/src/test/resources/com/github/darderion/mundaneassignmentpolice/" } } \ No newline at end of file diff --git a/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocumentTests.kt b/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocumentTests.kt index ad8c15bf..00d46e50 100644 --- a/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocumentTests.kt +++ b/src/test/kotlin/com/github/darderion/mundaneassignmentpolice/pdfdocument/PDFDocumentTests.kt @@ -8,7 +8,7 @@ import com.github.darderion.mundaneassignmentpolice.wrapper.PDFBox import io.kotest.core.spec.style.StringSpec import io.kotest.inspectors.forAll import io.kotest.matchers.shouldBe - +/* class PDFDocumentTests: StringSpec({ "PDFDocument should contain TITLE_PAGE's lines" { PDFDocument(text = lines).text.any { it.area == TITLE_PAGE } shouldBe true @@ -152,3 +152,5 @@ class PDFDocumentTests: StringSpec({ ) } } + + */ diff --git 
a/src/test/python/TableExtractionScriptTest.py b/src/test/python/TableExtractionScriptTest.py new file mode 100644 index 00000000..83ffccfa --- /dev/null +++ b/src/test/python/TableExtractionScriptTest.py @@ -0,0 +1,60 @@ +import unittest +import pandas +import contextlib +from pathlib import Path +import io +import os +import sys +import src.main.python.camelot +from src.main.python.TableExtractionScript import extraction + +sys.path.insert(0, '../src') + +class TableExtractionScriptTest(unittest.TestCase): + + def test_open_file(self): + pdf_path = 'src/test//resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/OpenNotPDF.docx' + + s = io.StringIO() + with contextlib.redirect_stdout(s): + extraction(pdf_path) + + self.assertEqual('invalid PDF file\n', s.getvalue()) + + def test_check_table_directory(self): + pdf_path = 'src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf' + extraction(pdf_path) + self.assertTrue(os.path.exists(f'uploads/tables/{Path(pdf_path).stem}')) + + def test_save_table(self): + pdf_path = 'src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf' + extraction(pdf_path) + self.assertTrue(os.path.exists('uploads/tables/TableInformation/TableInformation-page-1-table-1.csv')) + + def test_check_table_information(self): + pdf_path = 'src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf' + extraction(pdf_path) + table = pandas.read_csv(os.path.expanduser("~/map/uploads/tables/TableInformation/TableInformation-page-1-table-1.csv")) + camelot_table = src.main.python.camelot.read_pdf(pdf_path, linescale=30)[0] + self.assertEqual('table data', table.columns[0]) + + self.assertEqual('table information', table['table data'][4]) + + self.assertEqual('page', table['table data'][5]) + self.assertEqual('1', table['table data'][6]) + + self.assertEqual('table area', table['table data'][7]) + self.assertEqual(camelot_table.cells[3][0].x1, float(table['table data'][8])) + self.assertEqual(camelot_table.cells[3][3].x2, float(table['table data'][10])) + self.assertEqual(camelot_table.cells[3][0].y1, float(table['table data'][9])) + self.assertEqual(camelot_table.cells[0][3].y2, float(table['table data'][11])) + + self.assertEqual('rows', table['table data'][12]) + self.assertEqual('4', table['table data'][13]) + + self.assertEqual('columns', table['table data'][14]) + self.assertEqual('4', table['table data'][15]) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/test/python/camelot/camelot_py.py b/src/test/python/camelot/camelot_py.py new file mode 100644 index 00000000..1424c35f --- /dev/null +++ b/src/test/python/camelot/camelot_py.py @@ -0,0 +1,109 @@ +import os +import unittest +import sys +sys.path.insert(0, '../src') +import src.main.python.camelot as camelot +from src.main.python.camelot.image_processing import ( + intersectes +) +os.chdir(os.path.expanduser("~/map/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot")) + + +class DrawingLines(unittest.TestCase): + def test_v_draw(self): + file_name = 'DrawingVerticalLines.pdf' + + tables = camelot.read_pdf(file_name, latice=True, pages='1') + self.assertEqual(0, len(tables)) + + tables = camelot.read_pdf(file_name, latice=True, pages='2') + self.assertEqual(0, len(tables)) + + tables = camelot.read_pdf(file_name, latice=True, pages='3') + self.assertEqual(1, len(tables)) + 
self.assertEqual(5, len(tables[0].cells)) + self.assertEqual(1, len(tables[0].cols)) + self.assertEqual(5, len(tables[0].rows)) + + tables = camelot.read_pdf(file_name, latice=True, pages='4') + self.assertEqual(3, len(tables)) + + self.assertEqual(2, len(tables[0].cells)) + self.assertEqual(2, len(tables[1].cells)) + self.assertEqual(2, len(tables[2].cells)) + + self.assertEqual(1, len(tables[0].cols)) + self.assertEqual(1, len(tables[1].cols)) + self.assertEqual(1, len(tables[2].cols)) + + self.assertEqual(2, len(tables[0].rows)) + self.assertEqual(2, len(tables[1].rows)) + self.assertEqual(2, len(tables[2].rows)) + + def test_h_draw(self): + file_name = 'DrawingHorizontalLines.pdf' + + tables = camelot.read_pdf(file_name, latice=True, pages='1') + self.assertEqual(0, len(tables)) + + tables = camelot.read_pdf(file_name, latice=True, pages='2') + self.assertEqual(0, len(tables)) + + tables = camelot.read_pdf(file_name, latice=True, pages='3') + self.assertEqual(1, len(tables)) + + tables = camelot.read_pdf(file_name, latice=True, pages='4') + self.assertEqual(1, len(tables)) + + tables = camelot.read_pdf(file_name, latice=True, pages='5') + self.assertEqual(2, len(tables)) + + def test_intersects(self): + # rib1 intersects rib2 at first end + rib1, rib2 = (1, 100, 1, 5), (1, 5, 100, 5) + self.assertEqual(True, intersectes(rib1, rib2)) + + # rib1 intersects rib2 at second end + rib1, rib2 = (1, 100, 100, 100), (100, 100, 100, 5) + self.assertEqual(True, intersectes(rib1, rib2)) + + # horizontal rib1 parallel to horizontal rib2 + rib1, rib2 = (1, 100, 5, 100), (1, 200, 5, 200) + self.assertEqual(False, intersectes(rib1, rib2)) + + # vertical rib1 parallel to vertical rib2 + rib1, rib2 = (1, 100, 1, 200), (10, 100, 10, 200) + self.assertEqual(False, intersectes(rib1, rib2)) + + # rib1 intersects rib2 inside + rib1, rib2 = (1, 5, 100, 5), (50, 100, 50, 2) + self.assertEqual(True, intersectes(rib1, rib2)) + + # rib1 does not intersect rib2 + rib1, rib2 = (5, 10, 100, 10), (50, 60, 50, 40) + self.assertEqual(False, intersectes(rib1, rib2)) + + # rib1 lies on the same line as rib2 and does not intersect rib2 + rib1, rib2 = (5, 10, 100, 10), (150, 10, 160, 10) + self.assertEqual(False, intersectes(rib1, rib2)) + + def test_correct_lines(self): + file_name = 'DrawingComplexTables.pdf' + + tables = camelot.read_pdf(file_name, latice=True, pages='1') + self.assertEqual(1, len(tables)) + + tables = camelot.read_pdf(file_name, latice=True, pages='2') + self.assertEqual(2, len(tables)) + + + tables = camelot.read_pdf(file_name, latice=True, pages='3') + self.assertEqual(2, len(tables)) + + tables = camelot.read_pdf(file_name, latice=True, pages='4') + self.assertEqual(3, len(tables)) + + + +if __name__ == '__main__': + unittest.main() diff --git a/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingComplexTables.pdf b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingComplexTables.pdf new file mode 100644 index 00000000..9fce1ba4 Binary files /dev/null and b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingComplexTables.pdf differ diff --git a/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingHorizontalLines.pdf b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingHorizontalLines.pdf new file mode 100644 index 00000000..2de1c1a2 Binary files /dev/null and 
b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingHorizontalLines.pdf differ diff --git a/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingVerticalLines.pdf b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingVerticalLines.pdf new file mode 100644 index 00000000..c8526582 Binary files /dev/null and b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/camelot/DrawingVerticalLines.pdf differ diff --git a/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/OpenNotPDF.docx b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/OpenNotPDF.docx new file mode 100644 index 00000000..f839a32b Binary files /dev/null and b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/OpenNotPDF.docx differ diff --git a/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf new file mode 100644 index 00000000..d19b3964 Binary files /dev/null and b/src/test/resources/com/github/darderion/mundaneassignmentpolice/python/tableextractionscript/TableInformation.pdf differ
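Taken together, the helpers added in src/main/python/camelot/utils.py can be exercised on their own, independently of the table parsers. The sketch below is only an illustration: the file name is a placeholder, the region of interest is hand-picked, and the import path assumes the vendored package layout used by the Python tests above.

from src.main.python.camelot.utils import (
    get_page_layout,
    get_text_objects,
    text_in_bbox,
)

# Parse a single-page PDF into a PDFMiner LTPage plus its (width, height).
layout, dim = get_page_layout("some-single-page.pdf")

# Collect the horizontal text lines laid out on the page.
horizontal_text = get_text_objects(layout, ltype="horizontal_text")

# Keep only the lines whose centres fall inside a hand-picked region
# (x1, y1, x2, y2) in PDF coordinates; overlapping duplicates are discarded.
region = (50, 50, dim[0] - 50, dim[1] - 50)
for line in text_in_bbox(region, horizontal_text):
    print(round(line.y0, 1), line.get_text().strip())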