diff --git a/parser/declaration/declaration.go b/parser/declaration/declaration.go index f497e4b..a71bc56 100644 --- a/parser/declaration/declaration.go +++ b/parser/declaration/declaration.go @@ -15,6 +15,10 @@ type Declaration struct { Instituciones []*Job `json:"instituciones"` + Recepcion time.Time `json:"recepcion"` + Descarga time.Time `json:"descarga"` + Version string `json:"version"` + // Activos Deposits []*Deposit `json:"depositos"` Debtors []*Debtor `json:"deudores"` diff --git a/parser/extract/basics.go b/parser/extract/basics.go index 4652640..ebdfe85 100644 --- a/parser/extract/basics.go +++ b/parser/extract/basics.go @@ -12,7 +12,8 @@ func Date(e *Extractor) (time.Time, error) { e.BindFlag(EXTRACTOR_FLAG_1) if e.MoveUntilContains(CurrToken, "DECLARACIÓN") { for e.Scan() { - if isDate(e.CurrToken) { + if isDate(e.CurrToken) && + len(e.CurrToken) <= 10 { // not substring (%dd/%mm/%yyyy) date = e.CurrToken break } @@ -80,3 +81,77 @@ func Lastname(e *Extractor) (string, error) { } return value, nil } + +func ReceptionDate(e *Extractor) (time.Time, error) { + var date string + + e.BindFlag(EXTRACTOR_FLAG_1) + if e.MoveUntilStartWith(PrevToken, "RECEPCIONADO") { + val, check := isKeyValuePair(e.PrevToken, "RECEPCIONADO") + if check && + isDate(val) { + date = getDate(val) + } + + if date == "" && + isDate(e.NextToken) && + isBarCode(removeSpaces(e.CurrToken)) { + date = getDate(e.NextToken) + } + } + + if date == "" { + return time.Time{}, errors.New("failed when extracting reception date") + } + + t, err := time.Parse("02/01/2006", date) + if err != nil { + return time.Time{}, errors.New("Error parsing " + date + err.Error()) + } + return t, nil +} + +func DownloadDate(e *Extractor) (time.Time, error) { + var date string + + e.BindFlag(EXTRACTOR_FLAG_1) + if e.MoveUntilStartWith(NextToken, "página") && + isCurrLine(e.CurrToken, "versión") { + val := getDate(e.PrevToken) + val += " " + val += getTime(e.PrevToken) + + if isTimestamp(val) { + date = val + } + } + + if date == "" { + return time.Time{}, errors.New("failed when extracting download date") + } + + // RFC3339 layout + t, err := time.Parse("02/01/2006 15:04:05", date) + if err != nil { + return time.Time{}, errors.New("Error parsing " + date + err.Error()) + } + return t, nil +} + +func Version(e *Extractor) (string, error) { + var version string + + e.BindFlag(EXTRACTOR_FLAG_1) + if e.MoveUntilStartWith(CurrToken, "versión") { + val, check := isKeyValuePair(e.CurrToken, "versión") + if check { + version = val + } + } + + if version == "" { + return "", errors.New("failed when extracting version") + } + + return version, nil +} diff --git a/parser/extract/extractor.go b/parser/extract/extractor.go index 61faee4..973a09c 100644 --- a/parser/extract/extractor.go +++ b/parser/extract/extractor.go @@ -20,6 +20,7 @@ type Extractor struct { SavedLine int Buffer []string + BufferHookFunc func(tokens []string) []string Flags ExtractorFlag } @@ -50,6 +51,9 @@ func NewExtractor(raw string) *Extractor { return &Extractor{ RawData: raw, Scanner: bufio.NewScanner(strings.NewReader(raw)), + BufferHookFunc: func(tokens []string) []string { + return tokens + }, } } @@ -79,7 +83,7 @@ func (e *Extractor) Scan() bool { if e.Flags & EXTRACTOR_FLAG_3 != 0 && text != "" { - e.Buffer = tokenize(text, 3) + e.Buffer = e.BufferHookFunc(tokenize(text, 3)) text = e.Buffer[0] } @@ -171,7 +175,7 @@ func (e *Extractor) UnbindFlag(flag ExtractorFlag) { e.Flags &= flag } -func (e *Extractor) UnbindAllFlags(flag ExtractorFlag) { +func (e *Extractor) UnbindAllFlags() { e.Flags = 0 } @@ -236,6 +240,11 @@ func isDate(line string) bool { return matched } +func isTimestamp(line string) bool { + matched, _ := regexp.MatchString(`[0-9]{2}/[0-9]{2}/[0-9]{4}\s*[0-9]{2}:[0-9]{2}:[0-9]{2}`, line) + return matched +} + func isAlpha(line string) bool { matched, _ := regexp.MatchString(`[aA-zZ].*$`, line) return matched @@ -251,6 +260,11 @@ func isNumber(line string) bool { return matched } +func isBarCode(line string) bool { + matched, _ := regexp.MatchString(`[0-9]{5,6}-[0-9]{5,7}-[0-9]{1,3}`, line) + return matched +} + func isKeyValuePair(key string, precedence string) (string, bool) { r := strings.NewReplacer(":", "") inline := strings.Split(r.Replace(key), precedence) @@ -298,6 +312,10 @@ func removeAccents(s string) string { return r.Replace(s) } +func removeSpaces(s string) string { + return strings.ReplaceAll(s, " ", "") +} + // split a line into words that not exceed the max continuous spaces func tokenize(line string, max int) []string { var tokens []string @@ -326,6 +344,54 @@ func tokenize(line string, max int) []string { return tokens } +// join two or more array string if one of them contains the sep string +func combine(tokens []string, sep string) []string { + var result []string + length := len(tokens) + + for i := 0; i < length; i++ { + if i + 1 < length && + strings.Contains(tokens[i], sep) && + !strings.Contains(tokens[i+1], sep) { + result = append(result, tokens[i] + tokens[i+1]) + i += 1 + continue + } + result = append(result, tokens[i]) + } + return result +} + +func getTime(data string) string { + re := regexp.MustCompile(`[0-9]{2}`) + values := re.FindAllString(data, -1) + length := len(values) + + if length < 3 { + return "" + } + + return fmt.Sprintf("%s:%s:%s", values[length -3], values[length -2], values[length -1]) +} + +func getDate(data string) string { + re := regexp.MustCompile(`[0-9]{2}.[0-9]{2}.[0-9]{4}`) + result := re.FindString(data) + + if result == "" { + return "" + } + + re = regexp.MustCompile(`[0-9]{2,4}`) + values := re.FindAllString(result, -1) + + if len(values) < 3 { + return "" + } + + return fmt.Sprintf("%s/%s/%s", values[0], values[1], values[2]) +} + /* legacy code support don't use these functions @@ -389,8 +455,3 @@ func stringToYear(line string) int { return year } - -func isBarCode(line string) bool { - matched, _ := regexp.MatchString(`[0-9]{5,6}-[0-9]{5,7}-[0-9]{1,3}`, line) - return matched -} diff --git a/parser/extract/jobs.go b/parser/extract/jobs.go index 7a0871c..7e5fcf6 100644 --- a/parser/extract/jobs.go +++ b/parser/extract/jobs.go @@ -20,6 +20,12 @@ func Jobs(e *Extractor, parser *ParserData) []*declaration.Job { job := &declaration.Job{ } + /* tokenize the read text line and combine + the tokens that are separed by the ':' symbol */ + e.BufferHookFunc = func(tokens []string) []string { + return combine(tokens, ":") + } + if counter > 0 && e.MoveUntilStartWith(CurrToken, "DATOS LABORALES") { @@ -85,9 +91,12 @@ func getJobTitle(e *Extractor) string { func getJobInst(e *Extractor) string { - if strings.Contains(e.PrevToken, "INSTITUCIÓN") && + if strings.Contains(e.CurrToken, "INSTITUCIÓN") && strings.Contains(e.NextToken, "ACTO ADM. COM") { - return e.CurrToken + val, check := isKeyValuePair(e.CurrToken, "INSTITUCIÓN") + if check { + return val + } } if strings.Contains(e.PrevToken, "DIRECCIÓN") && diff --git a/parser/extract/parser.go b/parser/extract/parser.go index 9498d34..38f565b 100644 --- a/parser/extract/parser.go +++ b/parser/extract/parser.go @@ -98,8 +98,15 @@ func ParsePDF(file io.Reader) ParserData { d := &declaration.Declaration{} - // Basic Info. + // Header data d.Fecha = parser.check(Date(NewExtractor(res.Body))) + d.Recepcion = parser.check(ReceptionDate(NewExtractor(res.Body))) + + // Footer data + d.Descarga = parser.check(DownloadDate(NewExtractor(res.Body))) + d.Version = parser.checkStr(Version(NewExtractor(res.Body))) + + // Basic Info. d.Cedula = parser.checkInt(Cedula(NewExtractor(res.Body))) d.Nombre = parser.checkStr(Name(NewExtractor(res.Body))) d.Apellido = parser.checkStr(Lastname(NewExtractor(res.Body))) diff --git a/parser/extract/spouse.go b/parser/extract/spouse.go index 1a283e9..333dc27 100644 --- a/parser/extract/spouse.go +++ b/parser/extract/spouse.go @@ -25,7 +25,8 @@ func Spouse(e *Extractor) (string, error) { return "", err } - if fname != "" { + if fname != "" && + len(fname) <= 32 { spouse += fname continue } @@ -46,7 +47,8 @@ func Spouse(e *Extractor) (string, error) { return spouse, err } - if lname != "" { + if lname != "" && + len(lname) <= 32 { spouse = spouse + " " + lname break }