From 729f2286cf60f8f5f10a2a0f7acefe7930abe3bc Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Fri, 26 Mar 2021 18:27:03 -0300 Subject: [PATCH 1/4] add new extractions --- parser/declaration/declaration.go | 4 ++ parser/extract/basics.go | 61 +++++++++++++++++++++++++++++++ parser/extract/extractor.go | 15 +++++--- parser/extract/parser.go | 9 ++++- 4 files changed, 83 insertions(+), 6 deletions(-) diff --git a/parser/declaration/declaration.go b/parser/declaration/declaration.go index f497e4b..a71bc56 100644 --- a/parser/declaration/declaration.go +++ b/parser/declaration/declaration.go @@ -15,6 +15,10 @@ type Declaration struct { Instituciones []*Job `json:"instituciones"` + Recepcion time.Time `json:"recepcion"` + Descarga time.Time `json:"descarga"` + Version string `json:"version"` + // Activos Deposits []*Deposit `json:"depositos"` Debtors []*Debtor `json:"deudores"` diff --git a/parser/extract/basics.go b/parser/extract/basics.go index 4652640..a45b727 100644 --- a/parser/extract/basics.go +++ b/parser/extract/basics.go @@ -80,3 +80,64 @@ func Lastname(e *Extractor) (string, error) { } return value, nil } + +func ReceptionDate(e *Extractor) (time.Time, error) { + var date string + + e.BindFlag(EXTRACTOR_FLAG_1) + if e.MoveUntilContains(PrevToken, "RECEPCIONADO") && + isBarCode(e.CurrToken) && + isDate(e.NextToken) { + date = e.NextToken + } + + if date == "" { + return time.Time{}, errors.New("failed when extracting reception date") + } + + t, err := time.Parse("02/01/2006", date) + if err != nil { + return time.Time{}, errors.New("Error parsing " + date + err.Error()) + } + return t, nil +} + +func DownloadDate(e *Extractor) (time.Time, error) { + var date string + + e.BindFlag(EXTRACTOR_FLAG_1) + if e.MoveUntilStartWith(NextToken, "página") && + isCurrLine(e.CurrToken, "versión") && + isTimestamp(e.PrevToken) { + date = e.PrevToken + } + + if date == "" { + return time.Time{}, errors.New("failed when extracting download date") + } + + // RFC3339 layout + t, err := time.Parse("02/01/2006 15:04:05", date) + if err != nil { + return time.Time{}, errors.New("Error parsing " + date + err.Error()) + } + return t, nil +} + +func Version(e *Extractor) (string, error) { + var version string + + e.BindFlag(EXTRACTOR_FLAG_1) + if e.MoveUntilStartWith(CurrToken, "versión") { + val, check := isKeyValuePair(e.CurrToken, "versión") + if check { + version = val + } + } + + if version == "" { + return "", errors.New("failed when extracting version") + } + + return version, nil +} diff --git a/parser/extract/extractor.go b/parser/extract/extractor.go index 61faee4..84b9919 100644 --- a/parser/extract/extractor.go +++ b/parser/extract/extractor.go @@ -236,6 +236,11 @@ func isDate(line string) bool { return matched } +func isTimestamp(line string) bool { + matched, _ := regexp.MatchString(`[0-9]{2}/[0-9]{2}/[0-9]{4}\s*[0-9]{2}:[0-9]{2}:[0-9]{2}`, line) + return matched +} + func isAlpha(line string) bool { matched, _ := regexp.MatchString(`[aA-zZ].*$`, line) return matched @@ -251,6 +256,11 @@ func isNumber(line string) bool { return matched } +func isBarCode(line string) bool { + matched, _ := regexp.MatchString(`[0-9]{5,6}-[0-9]{5,7}-[0-9]{1,3}`, line) + return matched +} + func isKeyValuePair(key string, precedence string) (string, bool) { r := strings.NewReplacer(":", "") inline := strings.Split(r.Replace(key), precedence) @@ -389,8 +399,3 @@ func stringToYear(line string) int { return year } - -func isBarCode(line string) bool { - matched, _ := regexp.MatchString(`[0-9]{5,6}-[0-9]{5,7}-[0-9]{1,3}`, line) - return matched -} diff --git a/parser/extract/parser.go b/parser/extract/parser.go index 9498d34..38f565b 100644 --- a/parser/extract/parser.go +++ b/parser/extract/parser.go @@ -98,8 +98,15 @@ func ParsePDF(file io.Reader) ParserData { d := &declaration.Declaration{} - // Basic Info. + // Header data d.Fecha = parser.check(Date(NewExtractor(res.Body))) + d.Recepcion = parser.check(ReceptionDate(NewExtractor(res.Body))) + + // Footer data + d.Descarga = parser.check(DownloadDate(NewExtractor(res.Body))) + d.Version = parser.checkStr(Version(NewExtractor(res.Body))) + + // Basic Info. d.Cedula = parser.checkInt(Cedula(NewExtractor(res.Body))) d.Nombre = parser.checkStr(Name(NewExtractor(res.Body))) d.Apellido = parser.checkStr(Lastname(NewExtractor(res.Body))) From eba0cf28616b4ddd6b003419a68aea4afbacc160 Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Fri, 26 Mar 2021 18:39:27 -0300 Subject: [PATCH 2/4] adjust to flag 1 --- parser/extract/basics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/extract/basics.go b/parser/extract/basics.go index a45b727..14685aa 100644 --- a/parser/extract/basics.go +++ b/parser/extract/basics.go @@ -85,7 +85,7 @@ func ReceptionDate(e *Extractor) (time.Time, error) { var date string e.BindFlag(EXTRACTOR_FLAG_1) - if e.MoveUntilContains(PrevToken, "RECEPCIONADO") && + if e.MoveUntilStartWith(PrevToken, "RECEPCIONADO") && isBarCode(e.CurrToken) && isDate(e.NextToken) { date = e.NextToken From fdcf7fc9111ba316ccdc894c7ba9c09afe39be2d Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Mon, 29 Mar 2021 20:35:09 -0400 Subject: [PATCH 3/4] improve job extraction --- parser/extract/extractor.go | 26 ++++++++++++++++++++++++-- parser/extract/jobs.go | 13 +++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/parser/extract/extractor.go b/parser/extract/extractor.go index 84b9919..aa7fdb4 100644 --- a/parser/extract/extractor.go +++ b/parser/extract/extractor.go @@ -20,6 +20,7 @@ type Extractor struct { SavedLine int Buffer []string + BufferHookFunc func(tokens []string) []string Flags ExtractorFlag } @@ -50,6 +51,9 @@ func NewExtractor(raw string) *Extractor { return &Extractor{ RawData: raw, Scanner: bufio.NewScanner(strings.NewReader(raw)), + BufferHookFunc: func(tokens []string) []string { + return tokens + }, } } @@ -79,7 +83,7 @@ func (e *Extractor) Scan() bool { if e.Flags & EXTRACTOR_FLAG_3 != 0 && text != "" { - e.Buffer = tokenize(text, 3) + e.Buffer = e.BufferHookFunc(tokenize(text, 3)) text = e.Buffer[0] } @@ -171,7 +175,7 @@ func (e *Extractor) UnbindFlag(flag ExtractorFlag) { e.Flags &= flag } -func (e *Extractor) UnbindAllFlags(flag ExtractorFlag) { +func (e *Extractor) UnbindAllFlags() { e.Flags = 0 } @@ -336,6 +340,24 @@ func tokenize(line string, max int) []string { return tokens } +// join two or more array string if one of them contains the sep string +func combine(tokens []string, sep string) []string { + var result []string + length := len(tokens) + + for i := 0; i < length; i++ { + if i + 1 < length && + strings.Contains(tokens[i], sep) && + !strings.Contains(tokens[i+1], sep) { + result = append(result, tokens[i] + tokens[i+1]) + i += 1 + continue + } + result = append(result, tokens[i]) + } + return result +} + /* legacy code support don't use these functions diff --git a/parser/extract/jobs.go b/parser/extract/jobs.go index 7a0871c..7e5fcf6 100644 --- a/parser/extract/jobs.go +++ b/parser/extract/jobs.go @@ -20,6 +20,12 @@ func Jobs(e *Extractor, parser *ParserData) []*declaration.Job { job := &declaration.Job{ } + /* tokenize the read text line and combine + the tokens that are separed by the ':' symbol */ + e.BufferHookFunc = func(tokens []string) []string { + return combine(tokens, ":") + } + if counter > 0 && e.MoveUntilStartWith(CurrToken, "DATOS LABORALES") { @@ -85,9 +91,12 @@ func getJobTitle(e *Extractor) string { func getJobInst(e *Extractor) string { - if strings.Contains(e.PrevToken, "INSTITUCIÓN") && + if strings.Contains(e.CurrToken, "INSTITUCIÓN") && strings.Contains(e.NextToken, "ACTO ADM. COM") { - return e.CurrToken + val, check := isKeyValuePair(e.CurrToken, "INSTITUCIÓN") + if check { + return val + } } if strings.Contains(e.PrevToken, "DIRECCIÓN") && From 575d1903d532f41d54af767360ce59d5a854bd79 Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Mon, 12 Apr 2021 13:17:55 -0400 Subject: [PATCH 4/4] improvements --- parser/extract/basics.go | 30 ++++++++++++++++++++++-------- parser/extract/extractor.go | 34 ++++++++++++++++++++++++++++++++++ parser/extract/spouse.go | 6 ++++-- 3 files changed, 60 insertions(+), 10 deletions(-) diff --git a/parser/extract/basics.go b/parser/extract/basics.go index 14685aa..ebdfe85 100644 --- a/parser/extract/basics.go +++ b/parser/extract/basics.go @@ -12,7 +12,8 @@ func Date(e *Extractor) (time.Time, error) { e.BindFlag(EXTRACTOR_FLAG_1) if e.MoveUntilContains(CurrToken, "DECLARACIÓN") { for e.Scan() { - if isDate(e.CurrToken) { + if isDate(e.CurrToken) && + len(e.CurrToken) <= 10 { // not substring (%dd/%mm/%yyyy) date = e.CurrToken break } @@ -85,10 +86,18 @@ func ReceptionDate(e *Extractor) (time.Time, error) { var date string e.BindFlag(EXTRACTOR_FLAG_1) - if e.MoveUntilStartWith(PrevToken, "RECEPCIONADO") && - isBarCode(e.CurrToken) && - isDate(e.NextToken) { - date = e.NextToken + if e.MoveUntilStartWith(PrevToken, "RECEPCIONADO") { + val, check := isKeyValuePair(e.PrevToken, "RECEPCIONADO") + if check && + isDate(val) { + date = getDate(val) + } + + if date == "" && + isDate(e.NextToken) && + isBarCode(removeSpaces(e.CurrToken)) { + date = getDate(e.NextToken) + } } if date == "" { @@ -107,9 +116,14 @@ func DownloadDate(e *Extractor) (time.Time, error) { e.BindFlag(EXTRACTOR_FLAG_1) if e.MoveUntilStartWith(NextToken, "página") && - isCurrLine(e.CurrToken, "versión") && - isTimestamp(e.PrevToken) { - date = e.PrevToken + isCurrLine(e.CurrToken, "versión") { + val := getDate(e.PrevToken) + val += " " + val += getTime(e.PrevToken) + + if isTimestamp(val) { + date = val + } } if date == "" { diff --git a/parser/extract/extractor.go b/parser/extract/extractor.go index aa7fdb4..973a09c 100644 --- a/parser/extract/extractor.go +++ b/parser/extract/extractor.go @@ -312,6 +312,10 @@ func removeAccents(s string) string { return r.Replace(s) } +func removeSpaces(s string) string { + return strings.ReplaceAll(s, " ", "") +} + // split a line into words that not exceed the max continuous spaces func tokenize(line string, max int) []string { var tokens []string @@ -358,6 +362,36 @@ func combine(tokens []string, sep string) []string { return result } +func getTime(data string) string { + re := regexp.MustCompile(`[0-9]{2}`) + values := re.FindAllString(data, -1) + length := len(values) + + if length < 3 { + return "" + } + + return fmt.Sprintf("%s:%s:%s", values[length -3], values[length -2], values[length -1]) +} + +func getDate(data string) string { + re := regexp.MustCompile(`[0-9]{2}.[0-9]{2}.[0-9]{4}`) + result := re.FindString(data) + + if result == "" { + return "" + } + + re = regexp.MustCompile(`[0-9]{2,4}`) + values := re.FindAllString(result, -1) + + if len(values) < 3 { + return "" + } + + return fmt.Sprintf("%s/%s/%s", values[0], values[1], values[2]) +} + /* legacy code support don't use these functions diff --git a/parser/extract/spouse.go b/parser/extract/spouse.go index 1a283e9..333dc27 100644 --- a/parser/extract/spouse.go +++ b/parser/extract/spouse.go @@ -25,7 +25,8 @@ func Spouse(e *Extractor) (string, error) { return "", err } - if fname != "" { + if fname != "" && + len(fname) <= 32 { spouse += fname continue } @@ -46,7 +47,8 @@ func Spouse(e *Extractor) (string, error) { return spouse, err } - if lname != "" { + if lname != "" && + len(lname) <= 32 { spouse = spouse + " " + lname break }