Skip to content
Original file line number Diff line number Diff line change
@@ -1,18 +1,8 @@
package com.github.darderion.mundaneassignmentpolice.checker

import com.github.darderion.mundaneassignmentpolice.checker.rule.Rule
import com.github.darderion.mundaneassignmentpolice.checker.rule.list.ListRuleBuilder
import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.SymbolRule
import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.SymbolRuleBuilder
import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.and
import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.or
import com.github.darderion.mundaneassignmentpolice.checker.rule.tableofcontent.TableOfContentRuleBuilder
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFArea.*
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion.Companion.EVERYWHERE
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion.Companion.NOWHERE
import com.github.darderion.mundaneassignmentpolice.rules.RuleSet
import com.github.darderion.mundaneassignmentpolice.wrapper.PDFBox
import java.util.*

class Checker {
fun getRuleViolations(pdfName: String, ruleSet: RuleSet) = getRuleViolations(pdfName, ruleSet.rules)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package com.github.darderion.mundaneassignmentpolice.checker

/**
 * Punctuation marks the checker knows about.
 *
 * @property value the character that represents the mark in text
 */
enum class PunctuationMark(val value: Char) {
    FULL_STOP('.'),
    COMMA(',')
}

/** Returns `true` when this character is the [PunctuationMark.value] of one of the [PunctuationMark] entries. */
fun Char.isPunctuationMark() = PunctuationMark.values().any { mark -> mark.value == this }

/** Returns `true` when this string consists of exactly one character and that character is a punctuation mark. */
fun String.isPunctuationMark() = singleOrNull()?.isPunctuationMark() == true
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package com.github.darderion.mundaneassignmentpolice.checker.rule.formula

import com.github.darderion.mundaneassignmentpolice.checker.RuleViolation
import com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument
import com.github.darderion.mundaneassignmentpolice.pdfdocument.inside
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Formula
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Word

/**
 * Formula rule that hands each formula, together with the words that follow it, to [ruleBody].
 *
 * For every formula the rule gathers the words that come after the formula on its last line plus
 * the words of the next two non-empty lines inside the rule's area, removes every word fully
 * matched by one of [ignoredWords], and calls [ruleBody]. A non-empty result of [ruleBody]
 * becomes one [RuleViolation] for that formula.
 */
class FormulaPunctuationRule(
    type: RuleViolationType,
    name: String,
    private val ignoredWords: List<Regex>,
    private val ruleBody:
        (formula: Formula, filteredText: List<Word>, nextFormula: Formula?) -> List<Line>
) : FormulaRule(type, name) {
    override fun getViolations(document: PDFDocument, formulas: List<Formula>): List<RuleViolation> =
        formulas.mapIndexedNotNull { index, formula ->
            val wordsAfterFormula = collectFilteredTextAfter(formula, document)
            val reportedLines = ruleBody(formula, wordsAfterFormula, formulas.getOrNull(index + 1))

            if (reportedLines.isEmpty()) null else RuleViolation(reportedLines, name, type)
        }

    /** Collects the words following [formula] in [document], with the ignored words removed. */
    private fun collectFilteredTextAfter(formula: Formula, document: PDFDocument): List<Word> {
        val lastFormulaLine = formula.lines.last()

        // words on the formula's last line that come after the last word of the formula
        val restOfLine = lastFormulaLine.text.takeLastWhile { it != formula.text.last() }

        val followingWords = document.text.asSequence()
            .drop(lastFormulaLine.documentIndex + 1)
            .filter { it.area!! inside area && it.isNotEmpty() }
            .take(2) // take a line with formula reference and a line with words after the formula
            .flatMap { it.text.asSequence() }

        return (restOfLine + followingWords)
            .filterNot { word -> ignoredWords.any { it.matches(word.text) } }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package com.github.darderion.mundaneassignmentpolice.checker.rule.formula

import com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Formula
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Word

/**
 * Fluent builder for [FormulaPunctuationRule].
 *
 * Defaults: violation type [RuleViolationType.Error], name `"Rule name"`, no ignored words and a
 * rule body that never reports a violation.
 */
class FormulaPunctuationRuleBuilder {
    private var type: RuleViolationType = RuleViolationType.Error
    private var name: String = "Rule name"
    private val ignoredWords: MutableList<Regex> = mutableListOf()
    private var ruleBody: (formula: Formula, filteredText: List<Word>, nextFormula: Formula?) -> List<Line> =
        { _, _, _ -> emptyList() }

    /** Sets the name of the rule being built. */
    infix fun called(name: String) = this.also { this.name = name }

    /** Sets the violation type of the rule being built. */
    infix fun type(type: RuleViolationType) = this.also { this.type = type }

    /** Adds [regexes] to the patterns of words that are removed before the rule body is called. */
    fun ignoredWords(vararg regexes: Regex) = this.also { ignoredWords.addAll(regexes) }

    /**
     * Sets the rule body. It receives the formula, the filtered words following it and the next
     * formula (if any), and returns the lines to report; an empty list means no violation.
     */
    infix fun rule(
        ruleBody: (formula: Formula, filteredText: List<Word>, nextFormula: Formula?) -> List<Line>
    ) = this.also { this.ruleBody = ruleBody }

    /** Builds the rule from the current builder state. */
    fun getRule(): FormulaRule = FormulaPunctuationRule(
        type,
        name,
        // snapshot: later ignoredWords(...) calls on this builder must not alter an already built rule
        ignoredWords.toList(),
        ruleBody
    )
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package com.github.darderion.mundaneassignmentpolice.checker.rule.formula

import com.github.darderion.mundaneassignmentpolice.checker.RuleViolation
import com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType
import com.github.darderion.mundaneassignmentpolice.checker.rule.Rule
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFArea
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument
import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion
import com.github.darderion.mundaneassignmentpolice.pdfdocument.inside
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Formula
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.PostScriptFontType
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Word

/**
 * Base class for rules that work on the formulas of a document.
 *
 * The rule's region is [PDFArea.SECTION] only. Formulas are recognised as runs of words whose font
 * type is [PostScriptFontType.TYPE2]; single spaces inside a run are kept, trailing spaces are
 * trimmed. Subclasses receive the detected formulas through [getViolations].
 */
abstract class FormulaRule(
    type: RuleViolationType,
    name: String
) : Rule(PDFRegion.NOWHERE.except(PDFArea.SECTION), name, type) {
    /** Returns the violations found in [document] given the [formulas] detected in it. */
    abstract fun getViolations(document: PDFDocument, formulas: List<Formula>): List<RuleViolation>

    override fun process(document: PDFDocument) =
        getViolations(document, getAllFormulas(document))

    /** Scans the non-empty lines inside the rule's area and groups formula words into [Formula]s. */
    private fun getAllFormulas(document: PDFDocument): List<Formula> {
        val relevantLines = document.text.filter { it.area!! inside area && it.isNotEmpty() }

        val collected = mutableListOf<Formula>()
        val pendingWords = mutableListOf<Word>()
        val pendingLines = mutableSetOf<Line>()

        // Turns the words gathered so far into a formula and resets the accumulators.
        fun flushPending() {
            collected.add(Formula(pendingWords.dropLastWhile { it.text == " " }, pendingLines.toSet()))
            pendingWords.clear()
            pendingLines.clear()
        }

        for (line in relevantLines) {
            for (word in line.text) {
                val hasFormulaFont = word.font.type == PostScriptFontType.TYPE2
                val isSpaceInsideFormula = word.text == " " && pendingWords.isNotEmpty()

                when {
                    hasFormulaFont || isSpaceInsideFormula -> {
                        // also captures some records in tables and code listings
                        pendingWords.add(word)
                        pendingLines.add(line)
                    }
                    pendingWords.isNotEmpty() -> flushPending()
                }
            }
        }
        if (pendingWords.isNotEmpty()) flushPending()

        return filterFormulas(collected)
    }

    /** Drops one-word formulas that are just the "∗" character or a number. */
    private fun filterFormulas(formulas: List<Formula>) =
        formulas.filterNot { formula ->
            val onlyWord = formula.text.singleOrNull()?.text
            onlyWord != null && (
                onlyWord == "∗" || // remove single special characters
                    onlyWord.toDoubleOrNull() != null // remove numbers
                )
        }
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package com.github.darderion.mundaneassignmentpolice.pdfdocument

import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line
import mu.KotlinLogging
import java.lang.Exception

class PDFDocument(val name: String = "PDF",
val text: List<Line>,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
package com.github.darderion.mundaneassignmentpolice.pdfdocument.list

import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Coordinate
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Font
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Word
import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.*
import java.util.*

data class PDFList<T>(val value: MutableList<T> = mutableListOf(), val nodes: MutableList<PDFList<T>> = mutableListOf()) {
Expand Down Expand Up @@ -54,7 +51,7 @@ data class PDFList<T>(val value: MutableList<T> = mutableListOf(), val nodes: Mu
*/
fun getLists(lines: List<Line>): List<PDFList<Line>> {
// Adding a line to process a text that has no lines after a list
val lines = lines + Line(-1, -1, -1, listOf(Word("NOT A LIST ITEM", Font(0.0f), Coordinate(1000, -1))))
val lines = lines + Line(-1, -1, -1, listOf(Word("NOT A LIST ITEM", Font(), Coordinate(1000, -1))))

val lists: MutableList<PDFList<Line>> = mutableListOf()
val stack: Stack<PDFList<Line>> = Stack()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@ package com.github.darderion.mundaneassignmentpolice.pdfdocument.text

import com.github.darderion.mundaneassignmentpolice.utils.floatEquals

class Font(val size: Float) {
/**
 * Font type attached to a [Font].
 *
 * [NONE] is the type used by the no-argument [Font] constructor.
 */
enum class PostScriptFontType {
TYPE0, TYPE1, TYPE2, TYPE3, NONE
}

class Font(val type: PostScriptFontType, val size: Float) {
constructor(): this(PostScriptFontType.NONE, 0.0f)

override fun equals(other: Any?) = this === other ||
(other is Font && floatEquals(size, other.size))
(other is Font && type == other.type && floatEquals(size, other.size))

override fun hashCode() = size.hashCode()
override fun hashCode() = (type to size).hashCode()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package com.github.darderion.mundaneassignmentpolice.pdfdocument.text

/**
 * A formula found in a document.
 *
 * @property text the words that make up the formula
 * @property lines the lines the formula's words were collected from
 */
data class Formula(val text: List<Word>, val lines: Set<Line>)
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,8 @@ data class Line(val index: Int, val page: Int, val documentIndex: Int,
override fun toString() = "[$documentIndex -- $index, p.$page, $area, ${position.x}] --> '$content'"

/** Returns a new line without the first [numberOfItems] words; index, page, documentIndex and area are kept. */
fun drop(numberOfItems: Int) = Line(index, page, documentIndex, text.drop(numberOfItems), area)

/** Returns `true` when the line has no words, or exactly one word whose text is the empty string. */
fun isEmpty() = when (text.size) {
    0 -> true
    1 -> text.first().text == ""
    else -> false
}

/** Negation of [isEmpty]. */
fun isNotEmpty() = !isEmpty()
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ data class Word(val text: String, val font: Font, val position: Coordinate) {

companion object {
val spaceCharacter: Word
get() = Word(" ", Font(0.0f), Coordinate(0, 0))
get() = Word(" ", Font(), Coordinate(0, 0))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package com.github.darderion.mundaneassignmentpolice.rules
import com.github.darderion.mundaneassignmentpolice.checker.rule.Rule

val RULE_SET_RU = RuleSet(
mutableListOf(
listOf(
RULE_LITLINK,
RULE_SHORT_DASH,
RULE_MEDIUM_DASH,
Expand All @@ -21,6 +21,7 @@ val RULE_SET_RU = RuleSet(
)
+ RULES_SPACE_AROUND_BRACKETS
+ RULES_SMALL_NUMBERS
+ RULES_FORMULA_PUNCTUATION
)

class RuleSet(val rules: List<Rule>) {}
class RuleSet(val rules: List<Rule>)
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package com.github.darderion.mundaneassignmentpolice.rules

import com.github.darderion.mundaneassignmentpolice.checker.PunctuationMark
import com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType
import com.github.darderion.mundaneassignmentpolice.checker.isPunctuationMark
import com.github.darderion.mundaneassignmentpolice.checker.rule.formula.FormulaPunctuationRuleBuilder
import com.github.darderion.mundaneassignmentpolice.checker.rule.list.ListRuleBuilder
import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.SymbolRule
import com.github.darderion.mundaneassignmentpolice.checker.rule.regex.RegexRuleBuilder
import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.SymbolRuleBuilder
import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.and
Expand Down Expand Up @@ -265,3 +267,91 @@ val RULE_ORDER_OF_REFERENCES = RegexRuleBuilder()
referencesInIntList != referencesInIntList.sorted()
}.map { it.second }
}.getRule()

// Patterns of words that are skipped in the text following a formula.
private val ignoringAfterFormula = listOf(
    Regex("""\s"""), // a single whitespace character
    Regex("""\([0-9]+\)""") // ignore formula reference, e.g. "(1)"
)

// Capitalized word (Latin or Cyrillic) that indicates the beginning of a new sentence.
// Compiled once here instead of once per checked formula.
private val newSentenceAfterFormulaIndicator = """[A-ZА-Я].*?""".toRegex()

/**
 * Reports the last line of a formula that ends a sentence without a full stop.
 *
 * Decision order for each formula (words matching [ignoringAfterFormula] are already removed):
 * 1. nothing follows the formula: the formula must end with a full stop;
 * 2. the next formula follows directly, or after one punctuation word: no violation;
 * 3. a capitalized word follows: the formula must end with a full stop;
 * 4. a punctuation word and then a capitalized word follow: that punctuation must be a full stop;
 * 5. anything else: no violation.
 */
val fullStopAfterFormulaRule = FormulaPunctuationRuleBuilder()
    .called("Отсутствует точка после формулы")
    .ignoredWords(*ignoringAfterFormula.toTypedArray())
    .rule { formula, filteredText, nextFormula ->
        val violationLines = listOf(formula.lines.last())
        // lastOrNull: a final word with empty text means "no trailing punctuation", not a crash
        val lastFormulaSymbol = formula.text.last().text.lastOrNull()
        val fullStop = PunctuationMark.FULL_STOP.value

        if (filteredText.isEmpty()) {
            return@rule if (lastFormulaSymbol != fullStop) violationLines else emptyList()
        }

        // full stop is not required if there is another formula after the formula
        val (firstAfterFormula, secondAfterFormula) = filteredText.first() to filteredText.getOrNull(1)
        if (nextFormula != null &&
            (firstAfterFormula == nextFormula.text.first() ||
                firstAfterFormula.text.isPunctuationMark() && secondAfterFormula == nextFormula.text.first())
        ) {
            return@rule emptyList()
        }

        if (newSentenceAfterFormulaIndicator.matches(firstAfterFormula.text)) {
            return@rule if (lastFormulaSymbol != fullStop) violationLines else emptyList()
        }

        // case when a punctuation mark is after the formula and not the last symbol of the formula
        if (firstAfterFormula.text.isPunctuationMark() &&
            secondAfterFormula != null && newSentenceAfterFormulaIndicator.matches(secondAfterFormula.text)
        ) {
            return@rule if (firstAfterFormula.text.single() != fullStop) violationLines
            else emptyList()
        }

        return@rule emptyList()
    }
    .getRule()

// The word "где" ("where"), which introduces an explanation of the formula.
// Compiled once here instead of once per checked formula.
private val explanationAfterFormulaIndicator = """где""".toRegex()

/**
 * Reports the last line of a formula that should be followed by a comma but is not.
 *
 * Decision order for each formula (words matching [ignoringAfterFormula] are already removed):
 * 1. nothing follows the formula: no violation;
 * 2. the next formula follows directly: the formula must end with a comma or a full stop;
 * 3. a punctuation word and then the next formula follow: that punctuation must be a comma or a full stop;
 * 4. the word "где" follows: the formula must end with a comma;
 * 5. a punctuation word and then "где" follow: that punctuation must be a comma;
 * 6. anything else: no violation.
 */
val commaAfterFormulaRule = FormulaPunctuationRuleBuilder()
    .called("Отсутствует запятая после формулы")
    .ignoredWords(*ignoringAfterFormula.toTypedArray())
    .rule { formula, filteredText, nextFormula ->
        val violationLines = listOf(formula.lines.last())
        // lastOrNull: a final word with empty text means "no trailing punctuation", not a crash
        val lastFormulaSymbol = formula.text.last().text.lastOrNull()
        val comma = PunctuationMark.COMMA.value
        val fullStop = PunctuationMark.FULL_STOP.value

        if (filteredText.isEmpty()) return@rule emptyList()

        // comma is required if there is another formula after the formula
        val (firstAfterFormula, secondAfterFormula) = filteredText.first() to filteredText.getOrNull(1)
        if (nextFormula != null) {
            if (firstAfterFormula == nextFormula.text.first()) {
                return@rule if (lastFormulaSymbol == comma || lastFormulaSymbol == fullStop) emptyList()
                else violationLines
            }

            if (firstAfterFormula.text.isPunctuationMark() && secondAfterFormula == nextFormula.text.first()) {
                val separator = firstAfterFormula.text.single()
                return@rule if (separator == comma || separator == fullStop) emptyList()
                else violationLines
            }
        }

        if (explanationAfterFormulaIndicator.matches(firstAfterFormula.text)) {
            return@rule if (lastFormulaSymbol != comma) violationLines else emptyList()
        }

        // case when a punctuation mark is after the formula and not the last symbol of the formula
        if (firstAfterFormula.text.isPunctuationMark() &&
            secondAfterFormula != null && explanationAfterFormulaIndicator.matches(secondAfterFormula.text)
        ) {
            return@rule if (firstAfterFormula.text.single() != comma) violationLines
            else emptyList()
        }

        return@rule emptyList()
    }
    .getRule()

// All formula punctuation rules: full stop after a formula and comma after a formula.
val RULES_FORMULA_PUNCTUATION = listOf(fullStopAfterFormulaRule, commaAfterFormulaRule)
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ class PDFBox {

val strippers = listOf(stripper, textStripper)

var lineIndex = 0
for(pageIndex in (0..document.pages.count)) {
var lineIndex = -1
for (pageIndex in (0 until document.pages.count)) {
// For each page
strippers.forEach {
it.startPage = pageIndex + 1
Expand Down Expand Up @@ -142,7 +142,7 @@ class PDFBox {
contentIndex += contentItem.length

if (contentItem == " ") {
words.add(Word(word, font?: Font(0.0f), coordinates))
words.add(Word(word, font?: Font(), coordinates))
words.add(Word.spaceCharacter)
font = null
word = ""
Expand All @@ -163,7 +163,7 @@ class PDFBox {
stripperIndex++
}
}
if (font == null && word.isEmpty()) font = Font(0.0f)
if (font == null && word.isEmpty()) font = Font()
words.add(Word(word, font!!, coordinates))

Line(line, pageIndex, lineIndex, words.toList())
Expand Down
Loading