Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions parser/declaration/declaration.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ type Declaration struct {

Instituciones []*Job `json:"instituciones"`

Recepcion time.Time `json:"recepcion"`
Descarga time.Time `json:"descarga"`
Version string `json:"version"`

// Activos
Deposits []*Deposit `json:"depositos"`
Debtors []*Debtor `json:"deudores"`
Expand Down
77 changes: 76 additions & 1 deletion parser/extract/basics.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ func Date(e *Extractor) (time.Time, error) {
e.BindFlag(EXTRACTOR_FLAG_1)
if e.MoveUntilContains(CurrToken, "DECLARACIÓN") {
for e.Scan() {
if isDate(e.CurrToken) {
if isDate(e.CurrToken) &&
len(e.CurrToken) <= 10 { // not substring (%dd/%mm/%yyyy)
date = e.CurrToken
break
}
Expand Down Expand Up @@ -80,3 +81,77 @@ func Lastname(e *Extractor) (string, error) {
}
return value, nil
}

// ReceptionDate extracts the reception date of the declaration and returns
// it as a time.Time parsed with the day-first layout "02/01/2006".
//
// After binding EXTRACTOR_FLAG_1 it moves the extractor until PrevToken
// starts with "RECEPCIONADO" and then tries two sources, in order:
//
//  1. the value that follows the "RECEPCIONADO" label inside PrevToken;
//  2. NextToken, accepted only when it looks like a date and CurrToken
//     (with spaces removed) looks like a bar code.
//
// An error is returned when no date is found or when the normalized date
// cannot be parsed.
func ReceptionDate(e *Extractor) (time.Time, error) {
	var date string

	e.BindFlag(EXTRACTOR_FLAG_1)
	if e.MoveUntilStartWith(PrevToken, "RECEPCIONADO") {
		// First choice: the date is on the same token as the label.
		if val, ok := isKeyValuePair(e.PrevToken, "RECEPCIONADO"); ok && isDate(val) {
			date = getDate(val)
		}

		// Fallback: the date is on the following token and the current
		// token holds the bar code.
		if date == "" &&
			isDate(e.NextToken) &&
			isBarCode(removeSpaces(e.CurrToken)) {
			date = getDate(e.NextToken)
		}
	}

	if date == "" {
		return time.Time{}, errors.New("failed when extracting reception date")
	}

	t, err := time.Parse("02/01/2006", date)
	if err != nil {
		// Keep the offending value and the parser's reason clearly separated.
		return time.Time{}, errors.New("error parsing reception date " + date + ": " + err.Error())
	}
	return t, nil
}

// DownloadDate extracts the download timestamp of the declaration and
// returns it as a time.Time parsed with the day-first layout
// "02/01/2006 15:04:05".
//
// After binding EXTRACTOR_FLAG_1 it moves the extractor until NextToken
// starts with "página" and requires isCurrLine(CurrToken, "versión") to
// hold. The date and time are then both taken from PrevToken and
// normalized to "dd/mm/yyyy hh:mm:ss" before being validated.
//
// An error is returned when no timestamp is found or when the normalized
// timestamp cannot be parsed.
func DownloadDate(e *Extractor) (time.Time, error) {
	var date string

	e.BindFlag(EXTRACTOR_FLAG_1)
	if e.MoveUntilStartWith(NextToken, "página") &&
		isCurrLine(e.CurrToken, "versión") {
		// Rebuild the timestamp with fixed separators so it matches the
		// layout used below regardless of how PrevToken was formatted.
		val := getDate(e.PrevToken) + " " + getTime(e.PrevToken)

		if isTimestamp(val) {
			date = val
		}
	}

	if date == "" {
		return time.Time{}, errors.New("failed when extracting download date")
	}

	// Day-first custom layout (dd/mm/yyyy hh:mm:ss); this is not RFC 3339.
	t, err := time.Parse("02/01/2006 15:04:05", date)
	if err != nil {
		// Keep the offending value and the parser's reason clearly separated.
		return time.Time{}, errors.New("error parsing download date " + date + ": " + err.Error())
	}
	return t, nil
}

// Version extracts the value that follows the "versión" label.
//
// After binding EXTRACTOR_FLAG_1 it moves the extractor until CurrToken
// starts with "versión" and reads the value paired with that label. An
// error is returned when the label is not found or its value is empty.
func Version(e *Extractor) (string, error) {
	e.BindFlag(EXTRACTOR_FLAG_1)

	found := ""
	if e.MoveUntilStartWith(CurrToken, "versión") {
		if value, ok := isKeyValuePair(e.CurrToken, "versión"); ok {
			found = value
		}
	}

	if found != "" {
		return found, nil
	}
	return "", errors.New("failed when extracting version")
}
75 changes: 68 additions & 7 deletions parser/extract/extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ type Extractor struct {
SavedLine int

Buffer []string
BufferHookFunc func(tokens []string) []string

Flags ExtractorFlag
}
Expand Down Expand Up @@ -50,6 +51,9 @@ func NewExtractor(raw string) *Extractor {
return &Extractor{
RawData: raw,
Scanner: bufio.NewScanner(strings.NewReader(raw)),
BufferHookFunc: func(tokens []string) []string {
return tokens
},
}
}

Expand Down Expand Up @@ -79,7 +83,7 @@ func (e *Extractor) Scan() bool {

if e.Flags & EXTRACTOR_FLAG_3 != 0 &&
text != "" {
e.Buffer = tokenize(text, 3)
e.Buffer = e.BufferHookFunc(tokenize(text, 3))
text = e.Buffer[0]
}

Expand Down Expand Up @@ -171,7 +175,7 @@ func (e *Extractor) UnbindFlag(flag ExtractorFlag) {
e.Flags &= flag
}

func (e *Extractor) UnbindAllFlags(flag ExtractorFlag) {
func (e *Extractor) UnbindAllFlags() {
e.Flags = 0
}

Expand Down Expand Up @@ -236,6 +240,11 @@ func isDate(line string) bool {
return matched
}

// isTimestampPattern matches a "dd/mm/yyyy" date followed, after optional
// whitespace, by an "hh:mm:ss" time. It is compiled once at package
// initialization instead of on every call.
var isTimestampPattern = regexp.MustCompile(`[0-9]{2}/[0-9]{2}/[0-9]{4}\s*[0-9]{2}:[0-9]{2}:[0-9]{2}`)

// isTimestamp reports whether line contains a timestamp shaped like
// "dd/mm/yyyy hh:mm:ss". The pattern is not anchored, so surrounding text
// is allowed, and the digits are not range-checked.
func isTimestamp(line string) bool {
	return isTimestampPattern.MatchString(line)
}

func isAlpha(line string) bool {
matched, _ := regexp.MatchString(`[aA-zZ].*$`, line)
return matched
Expand All @@ -251,6 +260,11 @@ func isNumber(line string) bool {
return matched
}

// isBarCodePattern matches three dash-separated digit groups of 5-6, 5-7
// and 1-3 digits. It is compiled once at package initialization instead of
// on every call.
var isBarCodePattern = regexp.MustCompile(`[0-9]{5,6}-[0-9]{5,7}-[0-9]{1,3}`)

// isBarCode reports whether line contains a bar-code-like sequence of three
// dash-separated digit groups. The pattern is not anchored, so the sequence
// may appear anywhere in line.
func isBarCode(line string) bool {
	return isBarCodePattern.MatchString(line)
}

func isKeyValuePair(key string, precedence string) (string, bool) {
r := strings.NewReplacer(":", "")
inline := strings.Split(r.Replace(key), precedence)
Expand Down Expand Up @@ -298,6 +312,10 @@ func removeAccents(s string) string {
return r.Replace(s)
}

// removeSpaces returns s with every ASCII space character (' ') deleted.
// Other whitespace such as tabs or newlines is left untouched.
func removeSpaces(s string) string {
	const space = " "
	return strings.Replace(s, space, "", -1)
}

// split a line into words that do not exceed the max continuous spaces
func tokenize(line string, max int) []string {
var tokens []string
Expand Down Expand Up @@ -326,6 +344,54 @@ func tokenize(line string, max int) []string {
return tokens
}

// combine merges adjacent tokens: whenever a token contains sep and the
// token right after it does not, the two are concatenated (with nothing
// inserted between them) into a single entry and the second one is
// consumed. Every other token is copied through unchanged. At most two
// tokens are merged into one entry. The result is nil for empty input.
func combine(tokens []string, sep string) []string {
	var merged []string

	pos, total := 0, len(tokens)
	for pos < total {
		head := tokens[pos]
		pos++

		// pos now points at the follower, if there is one.
		if pos < total && strings.Contains(head, sep) && !strings.Contains(tokens[pos], sep) {
			head += tokens[pos]
			pos++
		}
		merged = append(merged, head)
	}
	return merged
}

// getTimePattern matches any run of exactly two consecutive digits. It is
// compiled once at package initialization instead of on every call.
var getTimePattern = regexp.MustCompile(`[0-9]{2}`)

// getTime builds an "hh:mm:ss" string from the last three two-digit groups
// found in data, whatever separates them. It returns "" when data holds
// fewer than three such groups.
//
// The groups are not required to belong to a time: if data carries only a
// date, pairs of digits from that date are used instead, so callers should
// validate the result.
func getTime(data string) string {
	groups := getTimePattern.FindAllString(data, -1)
	if len(groups) < 3 {
		return ""
	}

	last := groups[len(groups)-3:]
	return fmt.Sprintf("%s:%s:%s", last[0], last[1], last[2])
}

// getDatePattern matches a two-digit, two-digit, four-digit sequence whose
// parts are separated by exactly one non-digit character (newline
// excluded). Requiring non-digit separators keeps long digit runs, such as
// "1234567890", from being mistaken for a date. It is compiled once at
// package initialization instead of on every call.
var getDatePattern = regexp.MustCompile(`([0-9]{2})[^0-9\n]([0-9]{2})[^0-9\n]([0-9]{4})`)

// getDate finds the first date-like sequence in data (for example
// "27-08-2020" or "27.08.2020") and returns it normalized with slashes as
// "27/08/2020". It returns "" when data holds no such sequence. The digits
// are not range-checked.
func getDate(data string) string {
	parts := getDatePattern.FindStringSubmatch(data)
	if parts == nil {
		return ""
	}

	// parts[0] is the whole match; parts[1..3] are the three digit groups.
	return fmt.Sprintf("%s/%s/%s", parts[1], parts[2], parts[3])
}

/*
legacy code support
don't use these functions
Expand Down Expand Up @@ -389,8 +455,3 @@ func stringToYear(line string) int {

return year
}

// isBarCode reports whether line contains three dash-separated digit groups
// of 5-6, 5-7 and 1-3 digits. The match is not anchored, so the sequence
// may appear anywhere in line.
func isBarCode(line string) bool {
	pattern := regexp.MustCompile(`[0-9]{5,6}-[0-9]{5,7}-[0-9]{1,3}`)
	return pattern.MatchString(line)
}
13 changes: 11 additions & 2 deletions parser/extract/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ func Jobs(e *Extractor, parser *ParserData) []*declaration.Job {

job := &declaration.Job{ }

/* tokenize the text line that was read and combine
the tokens that are separated by the ':' symbol */
e.BufferHookFunc = func(tokens []string) []string {
return combine(tokens, ":")
}

if counter > 0 &&
e.MoveUntilStartWith(CurrToken, "DATOS LABORALES") {

Expand Down Expand Up @@ -85,9 +91,12 @@ func getJobTitle(e *Extractor) string {

func getJobInst(e *Extractor) string {

if strings.Contains(e.PrevToken, "INSTITUCIÓN") &&
if strings.Contains(e.CurrToken, "INSTITUCIÓN") &&
strings.Contains(e.NextToken, "ACTO ADM. COM") {
return e.CurrToken
val, check := isKeyValuePair(e.CurrToken, "INSTITUCIÓN")
if check {
return val
}
}

if strings.Contains(e.PrevToken, "DIRECCIÓN") &&
Expand Down
9 changes: 8 additions & 1 deletion parser/extract/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,15 @@ func ParsePDF(file io.Reader) ParserData {

d := &declaration.Declaration{}

// Basic Info.
// Header data
d.Fecha = parser.check(Date(NewExtractor(res.Body)))
d.Recepcion = parser.check(ReceptionDate(NewExtractor(res.Body)))

// Footer data
d.Descarga = parser.check(DownloadDate(NewExtractor(res.Body)))
d.Version = parser.checkStr(Version(NewExtractor(res.Body)))

// Basic Info.
d.Cedula = parser.checkInt(Cedula(NewExtractor(res.Body)))
d.Nombre = parser.checkStr(Name(NewExtractor(res.Body)))
d.Apellido = parser.checkStr(Lastname(NewExtractor(res.Body)))
Expand Down
6 changes: 4 additions & 2 deletions parser/extract/spouse.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ func Spouse(e *Extractor) (string, error) {
return "", err
}

if fname != "" {
if fname != "" &&
len(fname) <= 32 {
spouse += fname
continue
}
Expand All @@ -46,7 +47,8 @@ func Spouse(e *Extractor) (string, error) {
return spouse, err
}

if lname != "" {
if lname != "" &&
len(lname) <= 32 {
spouse = spouse + " " + lname
break
}
Expand Down