diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/getSectionPagesFunction.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/getSectionPagesFunction.kt new file mode 100644 index 00000000..16b83100 --- /dev/null +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/getSectionPagesFunction.kt @@ -0,0 +1,34 @@ +package com.github.darderion.mundaneassignmentpolice.checker + +import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument + +fun getPages(document: PDFDocument, word : String): Pair +{ + var pages = -1 to -1 + var linesIndexes = -1 to -1 + val lines = document.text.filter { + var isFirstSectionWithWord = true + document.areas!!.sections.forEachIndexed { index , section -> + if (isFirstSectionWithWord) { + if (section.title.contains(word) && word != "Заключение") { + linesIndexes = section.contentIndex to document.areas.sections[index + 1].contentIndex + isFirstSectionWithWord = false + } + else if (section.title.contains(word)) { + linesIndexes = section.contentIndex to -1 + isFirstSectionWithWord = false + } + } + + } + if (word!="Заключение") + linesIndexes.first <= it.documentIndex && it.documentIndex < linesIndexes.second + else linesIndexes.first <= it.documentIndex + } + + if (lines.isNotEmpty() && word!="Заключение") + pages = lines[0].page to lines.last().page + else if (lines.isNotEmpty()) + pages = lines[0].page to -1 + return pages +} \ No newline at end of file diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/line/LineRule.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/line/LineRule.kt new file mode 100644 index 00000000..ed3f24e4 --- /dev/null +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/line/LineRule.kt @@ -0,0 +1,41 @@ +package com.github.darderion.mundaneassignmentpolice.checker.rule.line + +import 
com.github.darderion.mundaneassignmentpolice.checker.RuleViolation +import com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType +import com.github.darderion.mundaneassignmentpolice.checker.rule.Rule +import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument +import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion +import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line + +class LineRule ( + val singleLinePredicates: MutableList<(line: Line) -> List> = mutableListOf(), + val multipleLinesPredicates : MutableList<(lines: List, document: PDFDocument) -> List>, + val linesFilters : MutableList <(lines: List, document: PDFDocument) -> List>, + type: RuleViolationType, + area: PDFRegion, + name: String + ): Rule(area, name, type) { + + override fun process(document: PDFDocument): List { + val rulesViolations: MutableSet = mutableSetOf() + + var lines = document.text + linesFilters.map { lines = it(lines, document) } + + if (lines.isNotEmpty()) { + singleLinePredicates.map { predicate -> + rulesViolations.addAll( + lines.map { predicate(it) } + .filter { it.isNotEmpty() }.map { + RuleViolation(it, name, type) + } + ) + } + multipleLinesPredicates.map { predicate -> + if (predicate(lines, document).isNotEmpty()) + rulesViolations.add(RuleViolation(predicate(lines, document), name, type)) + } + } + return rulesViolations.toList() + } + } \ No newline at end of file diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/line/LineRuleBuilder.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/line/LineRuleBuilder.kt new file mode 100644 index 00000000..d553dbd4 --- /dev/null +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/line/LineRuleBuilder.kt @@ -0,0 +1,34 @@ +package com.github.darderion.mundaneassignmentpolice.checker.rule.line + +import com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType 
+import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument +import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion +import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line + +class LineRuleBuilder { + private var region: PDFRegion = PDFRegion.EVERYWHERE + private val singleLinePredicates: MutableList<(line: Line) -> List> = mutableListOf() + private val multipleLinesPredicates : MutableList<(lines: List, document: PDFDocument) -> List> = mutableListOf() + private val linesFilters : MutableList <(lines: List, document: PDFDocument) -> List> = mutableListOf() + private var type: RuleViolationType = RuleViolationType.Error + private var name: String = "Rule name" + + + fun disallowInSingleLine(predicate: (line: Line) -> List) = this.also { singleLinePredicates.add(predicate) } + + fun disallowInMultipleLines(predicate: (lines: List, document: PDFDocument) -> List ) = this.also { multipleLinesPredicates.add(predicate) } + + fun addLinesFilter (predicate: (lines: List, document: PDFDocument) -> List) = this.also { linesFilters.add(predicate) } + + fun called(name: String) = this.also { this.name = name } + + infix fun type(type: RuleViolationType) = this.also { this.type = type } + + fun getRule() = LineRule( + singleLinePredicates, + multipleLinesPredicates, + linesFilters, + type, + region, + name) +} diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt index e8e2a712..21662aa4 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRule.kt @@ -10,8 +10,12 @@ import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion import com.github.darderion.mundaneassignmentpolice.pdfdocument.list.PDFList import 
com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line + class ListRule( - val predicates: List<(list: PDFList) -> List>, + val singleListPredicates: MutableList<(list: PDFList) -> List> = mutableListOf(), + val multipleListsPredicates : MutableList<(lists: List>)->List> = mutableListOf(), + val multipleListsPredicatesWithDocument : MutableList<(lists: List>, document: PDFDocument) -> List> = mutableListOf(), + val listsFilter : MutableList<(lists: List>,document: PDFDocument) -> MutableList>> , type: RuleViolationType, area: PDFRegion, name: String @@ -32,9 +36,11 @@ class ListRule( if (area.contains(SECTION)) lists.addAll(document.areas!!.lists) - val pdfLists = lists.map { it.getSublists() }.flatten() + var pdfLists = lists.map { it.getSublists() }.flatten() + + listsFilter.forEach { pdfLists = it(pdfLists, document) } - predicates.forEach { predicate -> + singleListPredicates.forEach { predicate -> rulesViolations.addAll( pdfLists.map { predicate(it) @@ -43,7 +49,13 @@ class ListRule( } ) } - + multipleListsPredicates.forEach { predicate -> + if (predicate(pdfLists).isNotEmpty()) rulesViolations.add(RuleViolation(predicate(pdfLists),name,type)) + } + multipleListsPredicatesWithDocument.forEach { predicate -> + if (predicate(pdfLists,document).isNotEmpty()) + rulesViolations.add(RuleViolation(predicate(pdfLists,document),name,type)) + } return rulesViolations.toList() } } diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRuleBuilder.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRuleBuilder.kt index 201db9d6..6316531a 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRuleBuilder.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/checker/rule/list/ListRuleBuilder.kt @@ -2,18 +2,24 @@ package com.github.darderion.mundaneassignmentpolice.checker.rule.list import 
com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFArea +import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFDocument import com.github.darderion.mundaneassignmentpolice.pdfdocument.list.PDFList import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line class ListRuleBuilder { private var region: PDFRegion = PDFRegion.EVERYWHERE - private val predicates: MutableList<(list: PDFList) -> List> = mutableListOf() + private val singleListPredicates: MutableList<(list: PDFList) -> List> = mutableListOf() + private val multipleListsPredicates : MutableList<(lists: List>)->List> = mutableListOf() + private val multipleListsPredicatesWithDocument : MutableList<(lists: List>, document: PDFDocument) -> List> = mutableListOf() + private val listsFilter : MutableList <(lists: List>,document: PDFDocument) -> MutableList>> = mutableListOf() private var type: RuleViolationType = RuleViolationType.Error private var name: String = "Rule name" - fun disallow(predicate: (list: PDFList) -> List) = this.also { predicates.add(predicate) } - + fun disallowInSingleList(predicate: (list: PDFList) -> List) = this.also { singleListPredicates.add(predicate) } + fun disallowInMultipleLists(predicate: (lists: List>) -> List) = this.also { multipleListsPredicates.add(predicate) } + fun disallowInMultipleListsWithDocument(predicate: (lists: List>, document: PDFDocument) -> List ) = this.also { multipleListsPredicatesWithDocument.add(predicate) } + fun addListsFilter (predicate: (lists: List>, document: PDFDocument) -> MutableList>) = this.also { listsFilter.add(predicate) } infix fun inArea(area: PDFArea) = this.also { region = PDFRegion.NOWHERE.except(area) } infix fun inArea(region: PDFRegion) = this.also { this.region = region } @@ -22,5 +28,11 @@ class ListRuleBuilder { infix fun type(type: 
RuleViolationType) = this.also { this.type = type } - fun getRule() = ListRule(predicates, type, region, name) + fun getRule() = ListRule(singleListPredicates, + multipleListsPredicates, + multipleListsPredicatesWithDocument, + listsFilter, + type, + region, + name) } diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt index ec5708da..15afab75 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/RuleSet.kt @@ -13,6 +13,8 @@ val RULE_SET_RU = RuleSet( RULE_MULTIPLE_LITLINKS, RULE_BRACKETS_LETTERS, RULE_CITATION, + RULE_NO_TASKS, + RULE_TASKS_MAPPING, RULE_SINGLE_SUBSECTION, RULE_TABLE_OF_CONTENT_NUMBERS, RULE_SYMBOLS_IN_SECTION_NAMES, @@ -23,6 +25,7 @@ val RULE_SET_RU = RuleSet( RULE_SECTIONS_ORDER, RULE_LOW_QUALITY_CONFERENCES, ) + + RULE_CONFIGURATION_IN_EXPERIMENTS + RULES_SPACE_AROUND_BRACKETS + RULES_SMALL_NUMBERS ) diff --git a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt index e0847e9c..e6acb8c6 100644 --- a/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt +++ b/src/main/kotlin/com/github/darderion/mundaneassignmentpolice/rules/Rules.kt @@ -2,8 +2,10 @@ package com.github.darderion.mundaneassignmentpolice.rules import com.github.darderion.mundaneassignmentpolice.checker.RuleViolationType import com.github.darderion.mundaneassignmentpolice.checker.Section +import com.github.darderion.mundaneassignmentpolice.checker.getPages import com.github.darderion.mundaneassignmentpolice.checker.rule.list.ListRuleBuilder import com.github.darderion.mundaneassignmentpolice.checker.rule.regex.RegexRuleBuilder +import com.github.darderion.mundaneassignmentpolice.checker.rule.line.LineRuleBuilder import 
com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.SymbolRuleBuilder import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.and import com.github.darderion.mundaneassignmentpolice.checker.rule.symbol.or @@ -14,9 +16,12 @@ import com.github.darderion.mundaneassignmentpolice.checker.rule.word.WordRuleBu import com.github.darderion.mundaneassignmentpolice.checker.rule.word.or import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFArea import com.github.darderion.mundaneassignmentpolice.pdfdocument.PDFRegion +import com.github.darderion.mundaneassignmentpolice.pdfdocument.list.PDFList +import com.github.darderion.mundaneassignmentpolice.pdfdocument.text.Line import com.github.darderion.mundaneassignmentpolice.utils.InvalidOperationException import com.github.darderion.mundaneassignmentpolice.utils.LowQualityConferencesUtil import com.github.darderion.mundaneassignmentpolice.utils.URLUtil +import java.io.File import java.util.* private val enLetters = "abcdefghijklmnopqrstuvwxyz" @@ -30,12 +35,12 @@ private val RU = rusLetters + rusCapitalLetters private val numbers = "0123456789" val RULE_LITLINK = SymbolRuleBuilder() - .symbol('?') - .ignoringAdjusting(*" ,$numbers".toCharArray()) - .shouldNotHaveNeighbor(*"[]".toCharArray()) - //.called("Symbol '?' in litlink") - .called("Символ ? в ссылке на литературу") - .getRule() + .symbol('?') + .ignoringAdjusting(*" ,$numbers".toCharArray()) + .shouldNotHaveNeighbor(*"[]".toCharArray()) + //.called("Symbol '?' in litlink") + .called("Символ ? 
в ссылке на литературу") + .getRule() val shortDash = '-' @@ -43,85 +48,85 @@ val shortDash = '-' // one-sided battle val shortDashRules = SymbolRuleBuilder() - .symbol(shortDash) - .shouldHaveNeighbor(*EN.toCharArray()) - .shouldHaveNeighbor(*RU.toCharArray()) - .shouldHaveNeighbor(*numbers.toCharArray()) - //.called("Incorrect usage of '-' symbol") - .called("Неправильное использование дефиса") - .inArea(PDFRegion.EVERYWHERE.except(PDFArea.BIBLIOGRAPHY, PDFArea.FOOTNOTE)) + .symbol(shortDash) + .shouldHaveNeighbor(*EN.toCharArray()) + .shouldHaveNeighbor(*RU.toCharArray()) + .shouldHaveNeighbor(*numbers.toCharArray()) + //.called("Incorrect usage of '-' symbol") + .called("Неправильное использование дефиса") + .inArea(PDFRegion.EVERYWHERE.except(PDFArea.BIBLIOGRAPHY, PDFArea.FOOTNOTE)) val RULE_SHORT_DASH = shortDashRules.getRule() and ( - shortDashRules.fromLeft().shouldHaveNeighbor('.') - .shouldNotHaveNeighbor(*numbers.toCharArray()).getRule() or - shortDashRules.fromRight().shouldHaveNeighbor('\n') - .shouldNotHaveNeighbor(*numbers.toCharArray()).getRule() - ) + shortDashRules.fromLeft().shouldHaveNeighbor('.') + .shouldNotHaveNeighbor(*numbers.toCharArray()).getRule() or + shortDashRules.fromRight().shouldHaveNeighbor('\n') + .shouldNotHaveNeighbor(*numbers.toCharArray()).getRule() + ) val mediumDash = '–' val RULE_MEDIUM_DASH = SymbolRuleBuilder() - .symbol(mediumDash) - .shouldHaveNeighbor(*numbers.toCharArray()) - //.called("Incorrect usage of '--' symbol") - .called("Неправильное использование короткого тире") - .inArea(PDFRegion.EVERYWHERE.except(PDFArea.BIBLIOGRAPHY, PDFArea.FOOTNOTE)) - .ignoringIfIndex(0) - .getRule() + .symbol(mediumDash) + .shouldHaveNeighbor(*numbers.toCharArray()) + //.called("Incorrect usage of '--' symbol") + .called("Неправильное использование короткого тире") + .inArea(PDFRegion.EVERYWHERE.except(PDFArea.BIBLIOGRAPHY, PDFArea.FOOTNOTE)) + .ignoringIfIndex(0) + .getRule() val longDash = '—' val RULE_LONG_DASH = 
SymbolRuleBuilder() - .symbol(longDash) - .ignoringAdjusting(' ') - .shouldNotHaveNeighbor(*numbers.toCharArray()) - //.called("Incorrect usage of '---' symbol") - .called("Неправильное использование длинного тире") - .inArea(PDFRegion.EVERYWHERE.except(PDFArea.BIBLIOGRAPHY, PDFArea.FOOTNOTE)) - .getRule() and SymbolRuleBuilder() - .symbol(longDash) - .shouldHaveNeighbor(' ') - .inArea(PDFRegion.EVERYWHERE.except(PDFArea.BIBLIOGRAPHY, PDFArea.FOOTNOTE)) - .getRule() + .symbol(longDash) + .ignoringAdjusting(' ') + .shouldNotHaveNeighbor(*numbers.toCharArray()) + //.called("Incorrect usage of '---' symbol") + .called("Неправильное использование длинного тире") + .inArea(PDFRegion.EVERYWHERE.except(PDFArea.BIBLIOGRAPHY, PDFArea.FOOTNOTE)) + .getRule() and SymbolRuleBuilder() + .symbol(longDash) + .shouldHaveNeighbor(' ') + .inArea(PDFRegion.EVERYWHERE.except(PDFArea.BIBLIOGRAPHY, PDFArea.FOOTNOTE)) + .getRule() val closingQuote = '”' val openingQuote = '“' val RULE_CLOSING_QUOTATION = SymbolRuleBuilder() - .symbol(closingQuote) - .ignoringEveryCharacterExcept(*"$closingQuote$openingQuote".toCharArray()) - .fromLeft().shouldHaveNeighbor(openingQuote) - .inNeighborhood(20) - .called("Неправильное использование закрывающей кавычки") - .getRule() + .symbol(closingQuote) + .ignoringEveryCharacterExcept(*"$closingQuote$openingQuote".toCharArray()) + .fromLeft().shouldHaveNeighbor(openingQuote) + .inNeighborhood(20) + .called("Неправильное использование закрывающей кавычки") + .getRule() val RULE_OPENING_QUOTATION = SymbolRuleBuilder() - .symbol(openingQuote) - .ignoringEveryCharacterExcept(*"$closingQuote$openingQuote".toCharArray()) - .fromRight().shouldHaveNeighbor(closingQuote) - .inNeighborhood(20) - .called("Неправильное использование открывающей кавычки") - .getRule() + .symbol(openingQuote) + .ignoringEveryCharacterExcept(*"$closingQuote$openingQuote".toCharArray()) + .fromRight().shouldHaveNeighbor(closingQuote) + .inNeighborhood(20) + .called("Неправильное 
использование открывающей кавычки") + .getRule() const val squareClosingBracket = ']' const val squareOpeningBracket = '[' val RULE_MULTIPLE_LITLINKS = SymbolRuleBuilder() - .symbol(squareClosingBracket) - .ignoringAdjusting(' ', ',') - .fromRight().shouldNotHaveNeighbor(squareOpeningBracket) - .called("Неправильное оформление нескольких ссылок") - .getRule() + .symbol(squareClosingBracket) + .ignoringAdjusting(' ', ',') + .fromRight().shouldNotHaveNeighbor(squareOpeningBracket) + .called("Неправильное оформление нескольких ссылок") + .getRule() const val bracket = '(' val RULE_BRACKETS_LETTERS = SymbolRuleBuilder() - .symbol(bracket) - .ignoringAdjusting(' ') - .fromRight().shouldNotHaveNeighbor(*rusCapitalLetters.toCharArray()) - .called("Большая русская буква после скобки") - .type(RuleViolationType.Warning) - .getRule() + .symbol(bracket) + .ignoringAdjusting(' ') + .fromRight().shouldNotHaveNeighbor(*rusCapitalLetters.toCharArray()) + .called("Большая русская буква после скобки") + .type(RuleViolationType.Warning) + .getRule() private const val openingBrackets = "([{<" private const val closingBrackets = ")]}>" @@ -129,250 +134,353 @@ private const val closingQuotes = "”»" private const val punctuationSymbols = ".,;:!?" 
private val spaceAroundBracketsRuleBuilders = List(2) { SymbolRuleBuilder() } - .map { it.shouldHaveNeighbor(' ', '\n') } - .map { it.called("Отсутствует пробел с внешней стороны скобок") } - .apply { - // setting up a rule that should look for a space before opening brackets - first().fromLeft().ignoringAdjusting(*openingBrackets.toCharArray()) - // and this rule should look for after closing brackets - last().fromRight() - .ignoringAdjusting(*"$punctuationSymbols$closingQuotes$closingBrackets".toCharArray()) - } + .map { it.shouldHaveNeighbor(' ', '\n') } + .map { it.called("Отсутствует пробел с внешней стороны скобок") } + .apply { + // setting up a rule that should look for a space before opening brackets + first().fromLeft().ignoringAdjusting(*openingBrackets.toCharArray()) + // and this rule should look for after closing brackets + last().fromRight() + .ignoringAdjusting(*"$punctuationSymbols$closingQuotes$closingBrackets".toCharArray()) + } // For case when round brackets are empty: "function()" private val openingRoundBracketExceptionalRule = SymbolRuleBuilder() - .symbol('(') - .fromRight().shouldHaveNeighbor(')') - .getRule() + .symbol('(') + .fromRight().shouldHaveNeighbor(')') + .getRule() val RULES_SPACE_AROUND_BRACKETS = spaceAroundBracketsRuleBuilders - .zip(listOf(openingBrackets, closingBrackets).map { it.toCharArray() }) - .map { pair -> pair.second.map { pair.first.symbol(it).getRule() } } - .flatten() - .map { - if (it.symbol == '(') it or openingRoundBracketExceptionalRule - else it - } + .zip(listOf(openingBrackets, closingBrackets).map { it.toCharArray() }) + .map { pair -> pair.second.map { pair.first.symbol(it).getRule() } } + .flatten() + .map { + if (it.symbol == '(') it or openingRoundBracketExceptionalRule + else it + } val RULE_CITATION = SymbolRuleBuilder() - .symbol('[') - .ignoringAdjusting(' ', '\n') - .fromLeft().shouldNotHaveNeighbor('.') - .called("Некорректное цитирование") - .inArea(PDFArea.SECTION) - .getRule() + .symbol('[') 
+ .ignoringAdjusting(' ', '\n') + .fromLeft().shouldNotHaveNeighbor('.') + .called("Некорректное цитирование") + .inArea(PDFArea.SECTION) + .getRule() val RULE_SINGLE_SUBSECTION = ListRuleBuilder() - .inArea(PDFRegion.NOWHERE.except(PDFArea.TABLE_OF_CONTENT)) - //.called("Only 1 subsection in a section") - .called("Одна подсекция в секции") - .disallow { - if (it.nodes.count() == 1) it.nodes.first().getText() else listOf() - }.getRule() + .inArea(PDFRegion.NOWHERE.except(PDFArea.TABLE_OF_CONTENT)) + //.called("Only 1 subsection in a section") + .called("Одна подсекция в секции") + .disallowInSingleList { + if (it.nodes.count() == 1) it.nodes.first().getText() else listOf() + }.getRule() + +val RULE_TASKS_MAPPING = ListRuleBuilder() + .inArea(PDFArea.SECTION) + .called("Задачи и результаты не совпадают") + .addListsFilter { _, document -> + val newLists = mutableListOf>() + val tasks = mutableListOf>() + val results = mutableListOf>() + val taskPages = getPages(document, "адач") // Задачи, задачи, задач, задача + val conclusionPages = getPages(document, "Заключение") + if (taskPages != -1 to -1 && conclusionPages != -1 to -1) { + document.areas!!.lists.forEach { + if (it.getText()[0].page >= taskPages.first && it.getText()[0].page < taskPages.second) + tasks.add(it) + if (it.getText()[0].page >= conclusionPages.first) + results.add(it) + } + } + newLists.addAll(tasks) + newLists.addAll(results) + newLists + } + .disallowInMultipleListsWithDocument { lists, document -> + val taskPages = getPages(document,"адач") + val conclusionPages = getPages(document, "Заключение") + var tasks = mutableListOf>() + var results = mutableListOf>() + + lists.forEach { + if (it.getText()[0].page >= taskPages.first && it.getText()[0].page < taskPages.second) + tasks.add(it) + if (it.getText()[0].page >= conclusionPages.first) + results.add(it) + } + if (taskPages == -1 to -1 || conclusionPages == -1 to -1) listOf() + else { + val tasksAndResultsSections = document.areas!!.sections + 
.filter { it.title.contains("адач") || it.title.contains("Заключение") } + if (tasks.isEmpty() && results.isEmpty()) { + listOf( // underline "задачи" "Заключение" + document.text[tasksAndResultsSections.first().titleIndex], + document.text[tasksAndResultsSections.last().titleIndex] + ) + } else if (tasks.isEmpty() && results.isNotEmpty()) + listOf(document.text[tasksAndResultsSections.first().titleIndex]) //underline "задачи" + else if (tasks.isNotEmpty() && results.isEmpty()) + listOf(document.text[tasksAndResultsSections.last().titleIndex])//underline "Заключение" + else if (tasks.isNotEmpty() && results.isNotEmpty()) { + if (results.size != results.filter { it.nodes.size < tasks[0].nodes.size }.toMutableList().size) + listOf() + else { + results = results.filter { it.nodes.size < tasks[0].nodes.size }.toMutableList() + results[0].getText()//all lists in conclusion are less than task lists + //underline first list in conclusion + } + } else listOf() + } + } + .getRule() + +val RULE_NO_TASKS = TableOfContentRuleBuilder() + .called("Задачи не выделены в содержании") + .disallow { + val tasks = it.filter { it.text.toString().contains("адач") } + if (tasks.isEmpty()) listOf(it.first()) else listOf() + } + .getRule() val RULE_TABLE_OF_CONTENT_NUMBERS = TableOfContentRuleBuilder() - .disallow { - it.filter { - // println("${it.text.count()} -> ${it.content}") - val text = it.text.filter { it.text.trim().isNotEmpty() } - ((text.count() == 3 && (text[1].text == Section.INTRODUCTION.title || - text[1].text == Section.CONCLUSION.title)) || - (text.count() == 4 && (text[1].text + " " + text[2].text) == Section.BIBLIOGRAPHY.title)) - } - }.called("Введение, заключение и список литературы не нумеруются") - .getRule() + .disallow { + it.filter { + // println("${it.text.count()} -> ${it.content}") + val text = it.text.filter { it.text.trim().isNotEmpty() } + ((text.count() == 3 && (text[1].text == Section.INTRODUCTION.title || + text[1].text == Section.CONCLUSION.title)) || + 
(text.count() == 4 && (text[1].text + " " + text[2].text) == Section.BIBLIOGRAPHY.title)) + } + }.called("Введение, заключение и список литературы не нумеруются") + .getRule() val RULE_SYMBOLS_IN_SECTION_NAMES = TableOfContentRuleBuilder() - .disallow { listOfLines -> - listOfLines.filter { line -> - val text = line.text.filterNot { it.text == "." } // remove leaders - .filterNot { it.text.contains("[0-9]+\\.".toRegex()) } // remove numbering - .joinToString("") - text.contains("[:.,]".toRegex()) - } - }.called("""Символы ":", ".", "," в названии секции""") - .getRule() + .disallow { listOfLines -> + listOfLines.filter { line -> + val text = line.text.filterNot { it.text == "." } // remove leaders + .filterNot { it.text.contains("[0-9]+\\.".toRegex()) } // remove numbering + .joinToString("") + text.contains("[:.,]".toRegex()) + } + }.called("""Символы ":", ".", "," в названии секции""") + .getRule() val sectionsThatMayPrecedeThis = mapOf>( - Section.INTRODUCTION.title to hashSetOf(""), - Section.PROBLEM_STATEMENT.title to hashSetOf(Section.INTRODUCTION.title), - Section.REVIEW.title to hashSetOf(Section.PROBLEM_STATEMENT.title), - Section.CONTENT.title to hashSetOf(Section.REVIEW.title, Section.CONTENT.title), - Section.CONCLUSION.title to hashSetOf(Section.CONTENT.title), - Section.BIBLIOGRAPHY.title to hashSetOf(Section.CONCLUSION.title) + Section.INTRODUCTION.title to hashSetOf(""), + Section.PROBLEM_STATEMENT.title to hashSetOf(Section.INTRODUCTION.title), + Section.REVIEW.title to hashSetOf(Section.PROBLEM_STATEMENT.title), + Section.CONTENT.title to hashSetOf(Section.REVIEW.title, Section.CONTENT.title), + Section.CONCLUSION.title to hashSetOf(Section.CONTENT.title), + Section.BIBLIOGRAPHY.title to hashSetOf(Section.CONCLUSION.title) ) val RULE_SECTIONS_ORDER = TableOfContentRuleBuilder() - .disallow { listOfLines -> - var nameOfPreviousSection = "" - listOfLines - .filterNot { line -> - val words = line.text - .filter { it.text.trim().isNotEmpty() } - 
.filterNot { it.text.contains("[0-9]+\\.".toRegex()) } // remove numbering - words.isEmpty() || words[0].text == Section.TABLE_OF_CONTENT.title - } - .filter { line -> - val words = line.text - .filter { it.text.trim().isNotEmpty() } - .filterNot { it.text.contains("[0-9]+\\.".toRegex()) } // remove numbering - - val sectionName = - if ((words[0].text + " " + words[1].text) == Section.BIBLIOGRAPHY.title || - (words[0].text + " " + words[1].text) == Section.PROBLEM_STATEMENT.title - ) - words[0].text + " " + words[1].text - else if (sectionsThatMayPrecedeThis.contains(words[0].text)) - words[0].text - else - Section.CONTENT.title - - val isRuleViolation = - !sectionsThatMayPrecedeThis[sectionName]!!.contains(nameOfPreviousSection) - nameOfPreviousSection = sectionName - isRuleViolation - } - } - .called("Неверный порядок секций") - .getRule() + .disallow { listOfLines -> + var nameOfPreviousSection = "" + listOfLines + .filterNot { line -> + val words = line.text + .filter { it.text.trim().isNotEmpty() } + .filterNot { it.text.contains("[0-9]+\\.".toRegex()) } // remove numbering + words.isEmpty() || words[0].text == Section.TABLE_OF_CONTENT.title + } + .filter { line -> + val words = line.text + .filter { it.text.trim().isNotEmpty() } + .filterNot { it.text.contains("[0-9]+\\.".toRegex()) } // remove numbering + + val sectionName = + if ((words[0].text + " " + words[1].text) == Section.BIBLIOGRAPHY.title || + (words[0].text + " " + words[1].text) == Section.PROBLEM_STATEMENT.title + ) + words[0].text + " " + words[1].text + else if (sectionsThatMayPrecedeThis.contains(words[0].text)) + words[0].text + else + Section.CONTENT.title + + val isRuleViolation = + !sectionsThatMayPrecedeThis[sectionName]!!.contains(nameOfPreviousSection) + nameOfPreviousSection = sectionName + isRuleViolation + } + } + .called("Неверный порядок секций") + .getRule() val smallNumbersRuleName = "Неправильное написание целых чисел от 1 до 9" val smallNumbersRuleArea = - 
PDFRegion.EVERYWHERE.except(PDFArea.PAGE_INDEX, PDFArea.TABLE_OF_CONTENT, PDFArea.BIBLIOGRAPHY) + PDFRegion.EVERYWHERE.except(PDFArea.PAGE_INDEX, PDFArea.TABLE_OF_CONTENT, PDFArea.BIBLIOGRAPHY) val allowedWordsOnLeft = arrayOf( - Regex("""[Рр]ис[a-я]*"""), - Regex("""[Тт]абл[a-я]*"""), Regex("""[Сс]х[a-я]*"""), - Regex("""[Dd]ef[a-z]*"""), Regex("""[Оо]пр[а-я]*"""), - Regex("""[Tt]h[a-z]*"""), Regex("""[Тт]еорема""") + Regex("""[Рр]ис[a-я]*"""), + Regex("""[Тт]абл[a-я]*"""), Regex("""[Сс]х[a-я]*"""), + Regex("""[Dd]ef[a-z]*"""), Regex("""[Оо]пр[а-я]*"""), + Regex("""[Tt]h[a-z]*"""), Regex("""[Тт]еорема""") ) val allowedWordsOnRight = arrayOf( - Regex("""[Gg][Bb]"""), Regex("""[Гг][Бб]"""), - Regex("""[Mm][Bb]"""), Regex("""[Мм][Бб]"""), - Regex("""[Gg][Hh][Zz]"""), Regex("""[Гг][Цц]"""), - Regex("""→""") + Regex("""[Gg][Bb]"""), Regex("""[Гг][Бб]"""), + Regex("""[Mm][Bb]"""), Regex("""[Мм][Бб]"""), + Regex("""[Gg][Hh][Zz]"""), Regex("""[Гг][Цц]"""), + Regex("""→""") ) -val smallNumbersRuleBuilder1 = WordRuleBuilder() //for nearest words - .called(smallNumbersRuleName) - .inArea(smallNumbersRuleArea) - .ignoringAdjusting(Regex("""\s"""), Regex("""\.""")) - .ignoringIfIndex(0) - -val smallNumbersRuleBuilder2 = WordRuleBuilder() //for decimal fractions and version numbers - .called(smallNumbersRuleName) - .inArea(smallNumbersRuleArea) - .shouldHaveNeighbor(Regex("""\."""), Regex(""","""), - Regex("""[0-9]+""")) - .shouldHaveNumberOfNeighbors(2) - -val smallNumbersRuleBuilder3 = WordRuleBuilder() //for links - .called(smallNumbersRuleName) - .inArea(smallNumbersRuleArea) - .fromLeft() - .ignoringWords(true) - .ignoringAdjusting(Regex(""","""), Regex("""\s""")) - .shouldHaveNeighbor(Regex("""\[""")) +val smallNumbersRuleBuilder1 = WordRuleBuilder() //for nearest words + .called(smallNumbersRuleName) + .inArea(smallNumbersRuleArea) + .ignoringAdjusting(Regex("""\s"""), Regex("""\.""")) + .ignoringIfIndex(0) + +val smallNumbersRuleBuilder2 = WordRuleBuilder() //for 
decimal fractions and version numbers + .called(smallNumbersRuleName) + .inArea(smallNumbersRuleArea) + .shouldHaveNeighbor( + Regex("""\."""), Regex(""","""), + Regex("""[0-9]+""") + ) + .shouldHaveNumberOfNeighbors(2) + +val smallNumbersRuleBuilder3 = WordRuleBuilder() //for links + .called(smallNumbersRuleName) + .inArea(smallNumbersRuleArea) + .fromLeft() + .ignoringWords(true) + .ignoringAdjusting(Regex(""","""), Regex("""\s""")) + .shouldHaveNeighbor(Regex("""\[""")) /* Nine rules, one per digit string "1".."9" (index + 1); each is the `or` of five word rules built from smallNumbersRuleBuilder1..3. */ val RULES_SMALL_NUMBERS = List(9) { index -> - smallNumbersRuleBuilder1.word((index + 1).toString()) - .fromLeft().shouldHaveNeighbor(*allowedWordsOnLeft).getRule() or - smallNumbersRuleBuilder1.word((index + 1).toString()) - .fromRight().shouldHaveNeighbor(*allowedWordsOnRight).getRule() or - smallNumbersRuleBuilder2.word((index + 1).toString()).fromLeft().getRule() or - smallNumbersRuleBuilder2.fromRight().getRule() or - smallNumbersRuleBuilder3.word((index + 1).toString()).getRule() + smallNumbersRuleBuilder1.word((index + 1).toString()) + .fromLeft().shouldHaveNeighbor(*allowedWordsOnLeft).getRule() or + smallNumbersRuleBuilder1.word((index + 1).toString()) + .fromRight().shouldHaveNeighbor(*allowedWordsOnRight).getRule() or + smallNumbersRuleBuilder2.word((index + 1).toString()).fromLeft().getRule() or + smallNumbersRuleBuilder2.fromRight().getRule() or + smallNumbersRuleBuilder3.word((index + 1).toString()).getRule() } /* Reports URLs for which URLUtil.isShortened returns true; "http://" is prepended when the URL does not start with "http", and an InvalidOperationException counts as "not shortened". */ val RULE_SHORTENED_URLS = URLRuleBuilder() - .called("Сокращённая ссылка") - .inArea(PDFRegion.NOWHERE.except(PDFArea.FOOTNOTE, PDFArea.BIBLIOGRAPHY)) - .disallow { urls -> - urls.filter { pair -> - try { - var url = pair.first - if (!url.startsWith("http")) url = "http://$url" - URLUtil.isShortened(url) - } catch (_: InvalidOperationException) { - false - } - }.map { it.second } - }.getRule() + .called("Сокращённая ссылка") + .inArea(PDFRegion.NOWHERE.except(PDFArea.FOOTNOTE, PDFArea.BIBLIOGRAPHY)) + .disallow { urls -> + urls.filter { pair -> + try { + var url = pair.first
+ if (!url.startsWith("http")) url = "http://$url" + URLUtil.isShortened(url) + } catch (_: InvalidOperationException) { + false + } + }.map { it.second } + }.getRule() /* Staged prefix check: drops URLs starting with "https://www"; only if none were dropped, drops those starting with "www"; only if still none were dropped, drops those starting with "htt". Whatever remains is reported. */ val RULE_URLS_UNIFORMITY = URLRuleBuilder() - .called("Ссылки разных видов") - .disallow { urls -> - var filteredUrls = urls.filter { pair -> - val url = pair.first - !url.startsWith("https://www") - } - if (urls.size == filteredUrls.size) { - filteredUrls = filteredUrls.filter { pair -> - val url = pair.first - !url.startsWith("www") - } - if (urls.size == filteredUrls.size) { - filteredUrls = filteredUrls.filter { pair -> - val url = pair.first - !url.startsWith("htt") - } - } - } - filteredUrls.map { it.second } - }.getRule() + .called("Ссылки разных видов") + .disallow { urls -> + var filteredUrls = urls.filter { pair -> + val url = pair.first + !url.startsWith("https://www") + } + if (urls.size == filteredUrls.size) { + filteredUrls = filteredUrls.filter { pair -> + val url = pair.first + !url.startsWith("www") + } + if (urls.size == filteredUrls.size) { + filteredUrls = filteredUrls.filter { pair -> + val url = pair.first + !url.startsWith("htt") + } + } + } + filteredUrls.map { it.second } + }.getRule() /* Matches bracketed reference lists such as "[3, 1]" and reports those whose numbers are not in ascending order. */ val RULE_ORDER_OF_REFERENCES = RegexRuleBuilder() - .called("Неверный порядок ссылок на литературу") - .regex(Regex("""\[[0-9,\s]+\]""")) - .searchIn(1) - .disallow { matches -> - matches.filter { pair -> - val references = pair.first - val referencesInIntList = references - .slice(IntRange(1, references.length - 2)) - .split(Regex(""",""")) - .map { it.trim() } - .filter { it.isNotEmpty() } - .map { it.toInt() } - referencesInIntList != referencesInIntList.sorted() - }.map { it.second } - }.getRule() + .called("Неверный порядок ссылок на литературу") + .regex(Regex("""\[[0-9,\s]+\]""")) + .searchIn(1) + .disallow { matches -> + matches.filter { pair -> + val references = pair.first + val referencesInIntList = references + .slice(IntRange(1, references.length - 2)) + .split(Regex(""","""))
+ .map { it.trim() } + .filter { it.isNotEmpty() } + .mapNotNull { it.toIntOrNull() } /* the regex also admits inner whitespace ("[1 2]") and arbitrarily long digit runs, for which toInt() throws; such tokens are skipped instead of aborting the check */ + referencesInIntList != referencesInIntList.sorted() + }.map { it.second } + }.getRule() /* A word is treated as an abbreviation when any character after its first is upper case; every occurrence of an abbreviation whose case-insensitive form appears in more than one spelling is reported. */ val RULE_VARIOUS_ABBREVIATIONS = RegexRuleBuilder() - .called("Использованы различные версии сокращения") - .regex(Regex("""[a-zA-Zа-яА-Я]+""")) - .inArea(PDFRegion.EVERYWHERE.except(PDFArea.BIBLIOGRAPHY)) - .disallow { matches -> - val abbreviations = hashSetOf<String>() - val allWords = hashMapOf<String, HashSet<String>>() - matches.forEach { pair -> - val word = pair.first - if (word.slice(IntRange(1, word.length - 1)) - .count { it.isUpperCase() } > 0) - abbreviations.add(word.uppercase()) - if (!allWords.containsKey(word.lowercase())) - allWords.put(word.lowercase(), hashSetOf()) - allWords[word.lowercase()]?.add(word) - } - matches.filter { pair -> - val word = pair.first - if (abbreviations.contains(word.uppercase())) - allWords[word.lowercase()]?.size!! > 1 - else - false - }.map { it.second } - }.getRule() + .called("Использованы различные версии сокращения") + .regex(Regex("""[a-zA-Zа-яА-Я]+""")) + .inArea(PDFRegion.EVERYWHERE.except(PDFArea.BIBLIOGRAPHY)) + .disallow { matches -> + val abbreviations = hashSetOf<String>() + val allWords = hashMapOf<String, HashSet<String>>() + matches.forEach { pair -> + val word = pair.first + if (word.slice(IntRange(1, word.length - 1)) + .count { it.isUpperCase() } > 0 + ) + abbreviations.add(word.uppercase()) + if (!allWords.containsKey(word.lowercase())) + allWords.put(word.lowercase(), hashSetOf()) + allWords[word.lowercase()]?.add(word) + } + matches.filter { pair -> + val word = pair.first + if (abbreviations.contains(word.uppercase())) + allWords[word.lowercase()]?.size!!
> 1 + else + false + }.map { it.second } + }.getRule() /* Reports bibliography URLs that contain any entry of LowQualityConferencesUtil.getList(); entries are compared with their "http://" / "https://" prefix removed. */ val RULE_LOW_QUALITY_CONFERENCES = URLRuleBuilder() - .called("Ссылка на низкокачественную конференцию") - .inArea(PDFArea.BIBLIOGRAPHY) - .disallow { urls -> - val lowQualityConferencesList = LowQualityConferencesUtil.getList() - .map { - it.removePrefix("http://").removePrefix("https://") - } - urls.filter { pair -> - val url = pair.first - lowQualityConferencesList - .any { conference -> url.contains(conference) } - }.map { it.second } - }.getRule() + .called("Ссылка на низкокачественную конференцию") + .inArea(PDFArea.BIBLIOGRAPHY) + .disallow { urls -> + val lowQualityConferencesList = LowQualityConferencesUtil.getList() + .map { + it.removePrefix("http://").removePrefix("https://") + } + urls.filter { pair -> + val url = pair.first + lowQualityConferencesList + .any { conference -> url.contains(conference) } + }.map { it.second } + }.getRule()
+
+const val precisionWordCount = 5
+const val fileConfigurationWordsName= "src/main/resources/HardSoftConfigurationWords.txt"
+
+/**
+ * Hardware/software keywords that [RULE_CONFIGURATION_IN_EXPERIMENTS] searches for.
+ *
+ * Loaded once, on first use, instead of on every rule evaluation. [fileConfigurationWordsName] is
+ * relative to the working directory, so when that file is absent the copy on the classpath is used.
+ * Blank entries are skipped because every string contains the empty string, and repeated entries
+ * are collapsed so that a single mention is not counted twice.
+ *
+ * @throws java.io.FileNotFoundException if the list is found neither on disk nor on the classpath
+ */
+private val hardSoftConfigurationWords: List<String> by lazy {
+    val wordsFile = File(fileConfigurationWordsName)
+    val rawWords = if (wordsFile.isFile) {
+        wordsFile.readLines()
+    } else {
+        // assumes the standard Gradle/Maven layout, where src/main/resources is the classpath root
+        object {}.javaClass.classLoader
+            .getResourceAsStream(wordsFile.name)
+            ?.bufferedReader()
+            ?.use { it.readLines() }
+            ?: throw java.io.FileNotFoundException(fileConfigurationWordsName)
+    }
+    rawWords.filter { it.isNotBlank() }.distinct()
+}
+
+/**
+ * Reports the first line of the experiments section when that section contains fewer than
+ * [precisionWordCount] keyword hits.
+ *
+ * The section starts on the first page returned by `getPages` for a title containing "ксперимент"
+ * (or, failing that, "Тестирование") and ends before the first page of "Заключение".
+ */
+val RULE_CONFIGURATION_IN_EXPERIMENTS = LineRuleBuilder()
+    .addLinesFilter { _, document ->
+        // "ксперимент" matches Эксперимент, экспериментов, эксперимент
+        val firstPage = getPages(document, "ксперимент").first.takeIf { it != -1 }
+            ?: getPages(document, "Тестирование").first
+        val conclusionPage = getPages(document, "Заключение").first
+
+        if (firstPage == -1) {
+            emptyList()
+        } else {
+            document.text.filter { line ->
+                // -1 means no "Заключение" section was found: keep everything from the section start
+                // instead of comparing pages against -1, which would drop every line and skip the check.
+                line.text.isNotEmpty() && line.page >= firstPage &&
+                    (conclusionPage == -1 || line.page < conclusionPage)
+            }
+        }
+    }
+    .disallowInMultipleLines { lines, _ ->
+        // Every (word, keyword) containment counts, so "IntelCore" scores for "Intel", "Core" and "IntelCore".
+        val keywordHits = lines.sumOf { line ->
+            line.text.sumOf { word -> hardSoftConfigurationWords.count { word.text.contains(it) } }
+        }
+
+        if (keywordHits < precisionWordCount) listOfNotNull(lines.firstOrNull()) else emptyList()
+    }
+    .called("Нет hard/soft конфигурации в экспериментах")
+    .getRule()
diff --git a/src/main/resources/HardSoftConfigurationWords.txt b/src/main/resources/HardSoftConfigurationWords.txt new file mode 100644 index 00000000..dd1377da --- /dev/null +++ b/src/main/resources/HardSoftConfigurationWords.txt @@ -0,0 +1,159 @@ +процессор +Процессор +i7 +i5 +GPU +операцион +Операцион +видеокарт +Видеокарт +контроллер +Контроллер +ОЗУ +блок +Блок +ЦП +сервер +Сервер +аппарат +Аппарат +Xeon +IntelCore +Core +core +ядер +ядр +Ядр +ГБ +ГГц +Гц +байт +Мбайт +ATA +BIOS +UEFI +DMA +IDE +ARM +ASUS +RHEEM +PCI +PCI-E +USB +FireWire +SCSI +SATA +SAS +controller +VDC +CRTC +Intel +intel +IBM +SCH +FSB +Hyper +Transport +QPI +northbridge +southbridge +ZIF +CISC +RISC +MISC +многоядер +Многоядер +одноядер +Одноядер +SISD +SIMD +MISD +MIMD +POST +ASUS +Acer +буфер +Буфер +Systems +жёстк +Жёстк +ATX +SSD +SSHD +Samsung +DMA +CPU +GTX +GeForce +IPv4 +IPv6 +ПО +ОС +GNU +Linux +Windows +Unix +Ubuntu +Docker +GoogleTest +Test +Java +.NET +C# +C++ +Perl +Python +Kotlin +OpenCV +MATLAB +Apache +Spark +PostgreSQL +DBMS +SQL +F# +JavaStreams +Postgres +Spark +Flink +Precision +Recall +PosDB +macOS +GraphX +Giraph +JGraph +HDFS +IDE +DOS +SDK +Mac +Google +Chrome +Opera +ISO +Mint +Debian +Fedora +ArchLinux +Mozilla +Safari +ESP +OSI +ProFuzzBench +OpenCL +OpenCV +Opera +Microsoft +CLIPS +OpenCyc +веб +CMS +HTML +БД +CAD +DDK +СУБД +Android +Xiaomi +LG \ No newline at end of file