diff --git a/parser/extract/debtors.go b/parser/extract/debtors.go index b27443d..7a8ffdc 100644 --- a/parser/extract/debtors.go +++ b/parser/extract/debtors.go @@ -1,122 +1,97 @@ package extract import ( - "bufio" - "errors" - "strconv" + "fmt" "strings" "github.com/InstIDEA/ddjj/parser/declaration" ) // Debtors returns the debts people have with the official. -func Debtors(scanner *bufio.Scanner) ([]*declaration.Debtor, error) { - var skip = []string{ - "#", - "NOMBRE DEL DEUDOR", - "CLASE (A LA VISTA O PLAZOS)", - "PLAZO EN", - "IMPORTE", - } - - scanner = MoveUntil(scanner, "1.3 CUENTAS A COBRAR", true) - - var debtors []*declaration.Debtor - opts := &debtorOpts{ - debtor: &declaration.Debtor{}, - counter: 0, - } - - index := 1 - skip = append(skip, strconv.Itoa(index)) - var total int64 - for scanner.Scan() { - line := scanner.Text() - - // Stop looking for debtors when this is found. - if line == "TOTAL CUENTAS POR COBRAR:" { - total = getTotalInCategory(scanner) - - // Next page or end. 
- scanner = MoveUntil(scanner, "NOMBRE DEL DEUDOR", true) - line = scanner.Text() - if line == "" { - break +func Debtors(e *Extractor, parser *ParserData) ([]*declaration.Debtor, error) { + var debtors []*declaration.Debtor //list of extracted debtors + debt := &declaration.Debtor{} //aux for the actual extraction + e.BindFlag(EXTRACTOR_FLAG_1) //removes blank lines + e.BindFlag(EXTRACTOR_FLAG_2) //trims spaces at both ends + //EXTRACTOR_FLAG_3 creates new tokens whenever the line contains 3 or more spaces + var bandera bool + bandera = false + counter := 0 + successful := 0 + if e.MoveUntilStartWith(CurrToken, "1.3 CUENTAS A COBRAR") { + for e.Scan() { + // the debtors table header and OBS lines are skipped + if isAssetFormField(e.CurrToken) { + bandera = true //we are in the table records because we have the header + continue } - - index = 1 - } - - if strings.Contains(line, "OBS:") { - continue - } - if contains(skip, line) || line == "" { - if line == strconv.Itoa(index) { - // Delete the index to avoid confusion with Plazo. - skip = skip[:len(skip)-1] + if strings.Contains(e.CurrToken, "OBS:") && bandera { + counter++ + continue + } + // end of the debtors table on the current page + if strings.Contains(e.CurrToken, "TOTAL CUENTAS POR COBRAR:") { + bandera = false + } + //if the flag (bandera) is true, we can proceed with the extraction + if bandera { + values := tokenize(e.CurrToken, 3) + if len(values) == 5 { + debt = detDebtor(values) + debtors = append(debtors, debt) + } } - continue - } - - d := getDebtor(opts, line) - if d != nil { - debtors = append(debtors, d) - opts.counter = -1 - opts.debtor = &declaration.Debtor{} - - // Skip the following item #. 
- index++ - skip[len(skip)-1] = strconv.Itoa(index) } - - opts.counter++ + successful = len(debtors) } - - totalDebtors := addDebtors(debtors) - - if total == 0 { - return nil, errors.New("failed when extracting debtors") + if successful != counter { + parser.addMessage(fmt.Sprintf("ignored debtors: %d/%d", counter-successful, counter)) } - if totalDebtors != total { - return nil, errors.New("debtors do not match") + if debtors == nil { + parser.addError(fmt.Errorf("failed when extracting debtors")) + return nil, nil } return debtors, nil } -type debtorOpts struct { - debtor *declaration.Debtor - counter int -} +/* +Function to check whether a given string is the table header of the section, i.e. contains every column title. +Parameter: string s +Return: true when s contains all the header column titles, false otherwise +*/ -func getDebtor(opts *debtorOpts, line string) *declaration.Debtor { - switch opts.counter { - case 0: - opts.debtor.Nombre = line - break - case 1: - opts.debtor.Clase = line - break - case 2: - value, _ := strconv.Atoi(line) - opts.debtor.Plazo = value - break - case 3: - value := strings.ReplaceAll(line, ".", "") - i, _ := strconv.ParseInt(value, 10, 64) - opts.debtor.Importe = i - return opts.debtor +func isAssetFormField(s string) bool { + formField := []string{ + "#", + "NOMBRE DEL DEUDOR", + "CLASE (A LA VISTA O PLAZOS)", + "PLAZO EN", + "IMPORTE", } - return nil + s = removeAccents(s) + for _, value := range formField { + if !strings.Contains(s, value) { + return false + } + } + + return true } -func addDebtors(debtors []*declaration.Debtor) int64 { - var total int64 - for _, d := range debtors { - total += d.Importe +/* +Function to load the extracted values into the Debtor structure. +Parameters: values is an array of strings. The first element is not inserted because it is the row index and not relevant. 
+Return: a pointer to a Debtor populated with the values from the array +*/ + +func detDebtor(values []string) *declaration.Debtor { + return &declaration.Debtor{ + Nombre: values[1], + Clase: values[2], + Plazo: stringToInt(values[3]), + Importe: stringToInt64(values[4]), } - - return total } diff --git a/parser/extract/parser.go b/parser/extract/parser.go index 0e32788..d1c548c 100644 --- a/parser/extract/parser.go +++ b/parser/extract/parser.go @@ -2,13 +2,14 @@ package extract import ( "bufio" - "code.sajari.com/docconv" "encoding/json" "fmt" - "github.com/InstIDEA/ddjj/parser/declaration" "io" "strings" "time" + + "code.sajari.com/docconv" + "github.com/InstIDEA/ddjj/parser/declaration" ) type ParserData struct { @@ -124,11 +125,8 @@ func ParsePDF(file io.Reader) ParserData { } // Debtors. - scanner = bufio.NewScanner(strings.NewReader(res.Body)) - d.Debtors, err = Debtors(scanner) - if err != nil { - parser.addError(err) - } + scanner = bufio.NewScanner(strings.NewReader(pl_res.Body)) + d.Debtors, err = Debtors(NewExtractor(pl_res.Body), &parser) // Real state. 
scanner = bufio.NewScanner(strings.NewReader(res.Body)) diff --git a/parser/main_test.go b/parser/main_test.go index e21317a..4ca84ea 100644 --- a/parser/main_test.go +++ b/parser/main_test.go @@ -2,9 +2,10 @@ package main import ( "fmt" - "github.com/InstIDEA/ddjj/parser/extract" "reflect" "testing" + + "github.com/InstIDEA/ddjj/parser/extract" ) func TestDarioRamon(t *testing.T) { @@ -194,6 +195,38 @@ func TestNataliaDure2019(t *testing.T) { AssertEqual(t, "2019-03-07", data.Data.Fecha.Format("2006-01-02")) } +func TestEddyNeufeld2016(t *testing.T) { + + data := handleSingleFile("./test_declarations/2024982_9fb18b249891f3e2f290e33e588d98b1.pdf") + + if data.Data == nil { + t.Errorf("Error parsing the document") + } + + for _, item := range data.Message { + fmt.Println(item) + } + + fmt.Printf("\n\n") + fmt.Println("Nombre: ", data.Data.Nombre) + fmt.Println("Fecha: ", data.Data.Fecha) + fmt.Println("Conyuge: ", data.Data.Conyuge) + fmt.Println("Cargo: ", data.Data.Instituciones[0].Cargo) + fmt.Println("Institucion: ", data.Data.Instituciones[0].Institucion) + fmt.Println("Resumen Activos: ", data.Data.Resumen.TotalActivo) + fmt.Println("Resumen Pasivos: ", data.Data.Resumen.TotalPasivo) + fmt.Println("Resumen Patrimonio Neto: ", data.Data.Resumen.PatrimonioNeto) + + AssertEqual(t, "EDDY", data.Data.Nombre) + AssertEqual(t, "2016-01-04", data.Data.Fecha.Format("2006-01-02")) + AssertEqual(t, "INTENDENTE MUNICIPAL", data.Data.Instituciones[0].Cargo) + AssertEqual(t, "MUNICIPALIDAD DE RAUL ARSENIO OVIEDO", data.Data.Instituciones[0].Institucion) + AssertEqual(t, "MIRNA ELIZABETH FLORENCIAƑEZ NEUFELD", data.Data.Conyuge) + AssertEqual(t, int64(108601862791), data.Data.Resumen.TotalActivo) + AssertEqual(t, int64(38970873094), data.Data.Resumen.TotalPasivo) + AssertEqual(t, int64(69630989697), data.Data.Resumen.PatrimonioNeto) +} + // AssertEqual checks if values are equal func AssertEqual(t *testing.T, want interface{}, got interface{}) { if want == got { diff --git 
a/parser/test_declarations/2024982_9fb18b249891f3e2f290e33e588d98b1.pdf b/parser/test_declarations/2024982_9fb18b249891f3e2f290e33e588d98b1.pdf new file mode 100644 index 0000000..8cbda30 Binary files /dev/null and b/parser/test_declarations/2024982_9fb18b249891f3e2f290e33e588d98b1.pdf differ