diff --git a/parser/declaration/declaration.go b/parser/declaration/declaration.go index f497e4b..9b54ddb 100644 --- a/parser/declaration/declaration.go +++ b/parser/declaration/declaration.go @@ -16,6 +16,7 @@ type Declaration struct { Instituciones []*Job `json:"instituciones"` // Activos + NetCash int64 `json:"efectivoEnGs."` Deposits []*Deposit `json:"depositos"` Debtors []*Debtor `json:"deudores"` RealStates []*RealState `json:"inmuebles"` @@ -26,6 +27,7 @@ type Declaration struct { Debts []*Debt `json:"deudas"` + IncomeMonthly int64 `json:"ingresosMensual"` IncomeAnnual int64 `json:"ingresosAnual"` ExpensesMonthly int64 `json:"egresosMensual"` @@ -169,5 +171,7 @@ func (d *Declaration) AddAssets() int64 { total += v.Importe } + total += d.NetCash + return total } diff --git a/parser/extract/asset.go b/parser/extract/asset.go deleted file mode 100644 index d834737..0000000 --- a/parser/extract/asset.go +++ /dev/null @@ -1,197 +0,0 @@ -package extract - -import ( - "bufio" - "github.com/pkg/errors" - "strconv" - "strings" - - "github.com/InstIDEA/ddjj/parser/declaration" -) - -var totalAssets int64 - -var assetsItemNumber int - -var skipAssets = []string{ - "#", - "DESCRIPCIÓN", - "EMPRESA", - "RUC", - "PAÍS", - "CANT.", - "PRECIO UNI.", - "IMPORTE", -} - -// Assets returns other assets owned by the official. -func Assets(scanner *bufio.Scanner) ([]*declaration.OtherAsset, error) { - scanner = MoveUntil(scanner, "1.9 OTROS ACTIVOS", true) - - // Also wants to skip item number - assetsItemNumber = 1 - skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber)) - - var assets []*declaration.OtherAsset - - values, nextPage := getAssetValues(scanner, 0, false) - for values[0] != "" { - asset := getAsset(scanner, values) - assets = append(assets, asset...) - - if nextPage { - assetsItemNumber = 1 - } else { - assetsItemNumber++ - } - // Also wants to skip item number - skipAssets[len(skipAssets)-1] = strconv.Itoa(assetsItemNumber) - - values, nextPage = getAssetValues(scanner, 0, false) - } - - total := addAssets(assets) - if total == 0 { - return nil, errors.New("failed when extracting other assets") - } - - if total != totalAssets { - return nil, errors.New("other assets do not match") - } - - // Reset variables for next call. - totalAssets = 0 - assetsItemNumber = 0 - - return assets, nil -} - -func getAssetValues(scanner *bufio.Scanner, index int, remaining bool) (values [7]string, nextPage bool) { - line, _ := getAssetLine(scanner) - for line != "" { - - values[index] = line - - // After reading all the possible values for a single item. - if index == 6 { - return - } - - index++ - - line, nextPage = getAssetLine(scanner) - } - - if remaining { - return - } - - return [7]string{}, false -} - -func getAsset(scanner *bufio.Scanner, values [7]string) []*declaration.OtherAsset { - // En algunos casos, el importe del primer activo está al final de la lista - // de activos. Por ejemplo Juan Afara 2014 - if !isNumber(values[6]) { - return getAsset2(scanner, values) - } - - return []*declaration.OtherAsset{getAsset1(values)} -} - -func getAsset1(values [7]string) *declaration.OtherAsset { - return &declaration.OtherAsset{ - Descripcion: values[0], - Empresa: values[1], - RUC: values[2], - Pais: values[3], - Cantidad: stringToInt64(values[4]), - Precio: stringToInt64(values[5]), - Importe: stringToInt64(values[6]), - } -} - -func getAsset2(scanner *bufio.Scanner, values [7]string) []*declaration.OtherAsset { - assets := []*declaration.OtherAsset{} - - firstAsset := getAsset1(values) - assets = append(assets, firstAsset) - - assetsItemNumber++ - skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber)) - - // values[6] is the descripcion in the second element. - tmp := values[6] - values, _ = getAssetValues(scanner, 1, false) - values[0] = tmp - secondAsset := getAsset1(values) - assets = append(assets, secondAsset) - - // Skip next item number. - assetsItemNumber++ - skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber)) - - values, nextPage := getAssetValues(scanner, 0, true) - counter := 0 - for values[1] != "" && !nextPage { - assets = append(assets, getAsset1(values)) - - assetsItemNumber++ - skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber)) - counter++ - - values, nextPage = getAssetValues(scanner, 0, true) - } - - // The last value is the importe for the first item. - firstAsset.Importe = stringToInt64(values[0]) - - // Restore skip assets to default state. The caller would remove the other - // remaining value. - skipAssets = skipAssets[:len(skipAssets)-counter-2] - assetsItemNumber = 1 - - return assets -} - -func getAssetLine(scanner *bufio.Scanner) (line string, nextPage bool) { - for scanner.Scan() { - line = scanner.Text() - - // Stop looking for assets when this is found. - if line == "TOTAL OTROS ACTIVOS" { - totalAssets = getTotalInCategory(scanner) - - // Next page or end. - scanner = MoveUntil(scanner, "TIPO MUEBLES", true) - line = scanner.Text() - nextPage = true - - assetsItemNumber = 1 - skipAssets[len(skipAssets)-1] = strconv.Itoa(assetsItemNumber) - } - - if strings.Contains(line, "OBS:") || strings.Contains(line, "RECEPCIONADO EL:") { - continue - } - if isDate(line) || isBarCode(line) { - continue - } - if line == "" || contains(skipAssets, line) { - continue - } - - return line, nextPage - } - - return "", false -} - -func addAssets(assets []*declaration.OtherAsset) int64 { - var total int64 - for _, a := range assets { - total += a.Importe - } - - return total -} diff --git a/parser/extract/cash.go b/parser/extract/cash.go new file mode 100644 index 0000000..37e1b23 --- /dev/null +++ b/parser/extract/cash.go @@ -0,0 +1,24 @@ +package extract + +import ( + "strings" +) + +func Cash(e *Extractor, parser *ParserData) int64 { + e.BindFlag(EXTRACTOR_FLAG_3) + + if e.MoveUntilContains(CurrToken, "1. ACTIVOS"){ + for e.Scan(){ + if strings.Contains(e.CurrToken, "1.1 EFECTIVO EN GS."){ + if isNumber(e.NextToken) { + return StringToInt64(e.NextToken) + } + + } + + } + } + + return 0 + +} diff --git a/parser/extract/otherAssets.go b/parser/extract/otherAssets.go new file mode 100644 index 0000000..a316856 --- /dev/null +++ b/parser/extract/otherAssets.go @@ -0,0 +1,143 @@ +package extract + +import ( + "strings" + "github.com/pkg/errors" + "github.com/InstIDEA/ddjj/parser/declaration" +) + +func otherAssets(e *Extractor, parser *ParserData) ([]*declaration.OtherAsset, error){ + + e.BindFlag(EXTRACTOR_FLAG_1) + e.BindFlag(EXTRACTOR_FLAG_2) + + var assets []*declaration.OtherAsset + assets = countAssets(e, assets) + + total := addAssets(assets) + if total == 0 { + return nil, errors.New("failed when extracting other assets") + } + + return assets, nil +} + +func countAssets(e *Extractor, assets []*declaration.OtherAsset) []*declaration.OtherAsset { + asset := &declaration.OtherAsset{ } + for e.Scan() { + if strings.Contains(e.CurrToken, "ACCIONES"){ + //Cuando el nombre de la empresa tiene dos lineas, queda en PrevToken, sino queda OBS N/A o el form field y es el caso "normal" + if strings.Contains(e.PrevToken, "OBS: N/A") || isAssetFormField(e.PrevToken){ + values := tokenize(e.CurrToken, 5) + //asset is added only if it has all of the needed values + if len(values) == 8 { + asset = getAsset3(values) + assets = append(assets, asset) + } else { + continue + } + }else{ + //Cuando hay dos lineas, la primera linea queda en PrevToken, en CurrToken queda el indice y ACCIONES y en NextToken la segunda linea del nombre + //entonces se concatenan PrevToken y NextToken, y luego se vuelve a scanear para tener el resto de los datos + //Tambien se tiene el caso en el que el nombre se encuentra una linea mas arriba a pesar de no tener dos lineas (Ejemplo: Cartes 2018, Consignataria de Ganado S.A>) + var name string + if !strings.Contains(e.NextToken, "OBS: N/A"){ + name = e.PrevToken + " " + e.NextToken + } + for i := 1; i < 3; i++ { + e.Scan() + } + //aditional values that are not in the line but are needed to have the full asset, included the name + additional := []string{"#","ACCIONES", name} + values := append(additional, tokenize(e.CurrToken, 4)...) + if len(values) == 8 { + asset = getAsset3(values) + assets = append(assets, asset) + } else { + continue + } + + } + } else if strings.Contains(e.CurrToken, "CERTIFICADO DE DEPOSITOS DE"){ + //subsequent scans are needed due to the document format + for i := 1; i < 4; i++ { + e.Scan() + } + //fixed values that are not in the line but are needed to have the full asset + fixed := []string{"#","CERTIFICADO DE DEPOSITOS DE AHORROS"} + values := append(fixed, tokenize(e.CurrToken, 4)...) + asset = getAsset3(values) + assets = append(assets, asset) + } else if strings.Contains(e.CurrToken, "INVERSIONES") || strings.Contains(e.CurrToken, "BONOS") || strings.Contains(e.CurrToken, "PATENTES") || (strings.Contains(e.CurrToken, "OTROS") && strings.Contains(e.NextToken, "OBS: N/A")){ + values := tokenize(e.CurrToken, 5) + if len(values) == 8 { + asset = getAsset3(values) + assets = append(assets, asset) + } else { + continue + } + }else{ + continue + } + } + + return assets +} + +/* +Function to check if a given string is or not the header of the section. +Parameter: string s +Return: True or false +*/ + +func isAssetFormField(s string) bool { + formField := []string { + "#", + "DESCRIPCION", + "EMPRESA", + "RUC", + "PAIS", + "CANT.", + "PRECIO UNI.", + "IMPORTE", + } + + s = removeAccents(s) + for _, value := range formField { + if isCurrLine(s, value) { + return true + } + } + + return false +} + +/* +Function to load the extracted values into the OtherAsset structure. +Parameters: values in an array of strings. The first element is not inserted because it is the index and not relevant. +Return: an instance of OtherAsset with the values from the array +*/ + +func getAsset3(values []string) *declaration.OtherAsset { + return &declaration.OtherAsset{ + Descripcion: values[1], + Empresa: values[2], + RUC: values[3], + Pais: values[4], + Cantidad: stringToInt64(values[5]), + Precio: stringToInt64(values[6]), + Importe: stringToInt64(values[7]), + } +} + +/* +Function to calculate the total of the extracted assets. +*/ + +func addAssets(assets []*declaration.OtherAsset) int64 { + var total int64 + for _, a := range assets { + total += a.Importe + } + return total +} \ No newline at end of file diff --git a/parser/extract/parser.go b/parser/extract/parser.go index 9498d34..19d7a1c 100644 --- a/parser/extract/parser.go +++ b/parser/extract/parser.go @@ -110,6 +110,9 @@ func ParsePDF(file io.Reader) ParserData { // Jobs d.Instituciones = Jobs(NewExtractor(pl_res.Body), &parser) + // Cash + d.NetCash = Cash(NewExtractor(pl_res.Body), &parser) + // Deposits scanner := bufio.NewScanner(strings.NewReader(res.Body)) d.Deposits, err = Deposits(scanner) @@ -149,12 +152,11 @@ func ParsePDF(file io.Reader) ParserData { parser.addError(err) } - // Other assets - scanner = bufio.NewScanner(strings.NewReader(res.Body)) - d.OtherAssets, err = Assets(scanner) + //Other Assets + d.OtherAssets, err = otherAssets(NewExtractor(pl_res.Body), &parser) if err != nil { parser.addError(err) - } + } // Debts scanner = bufio.NewScanner(strings.NewReader(res.Body)) diff --git a/parser/test_declarations/41567_JUAN_EUDES_AFARA_MACIEL.pdf b/parser/test_declarations/41567_JUAN_EUDES_AFARA_MACIEL.pdf new file mode 100644 index 0000000..4406ed8 Binary files /dev/null and b/parser/test_declarations/41567_JUAN_EUDES_AFARA_MACIEL.pdf differ diff --git a/parser/test_declarations/555501_FIDEL_SANTIAGO_ZAVALA_SERRATI.pdf b/parser/test_declarations/555501_FIDEL_SANTIAGO_ZAVALA_SERRATI.pdf new file mode 100644 index 0000000..24b2400 Binary files /dev/null and b/parser/test_declarations/555501_FIDEL_SANTIAGO_ZAVALA_SERRATI.pdf differ diff --git a/parser/test_declarations/581842_HORACIO_MANUEL_CARTES_JARA.pdf b/parser/test_declarations/581842_HORACIO_MANUEL_CARTES_JARA.pdf new file mode 100644 index 0000000..cbf4f20 Binary files /dev/null and b/parser/test_declarations/581842_HORACIO_MANUEL_CARTES_JARA.pdf differ diff --git a/parser/test_declarations/826725_LILIAN_GRACIELA_SAMANIEGO_GONZALEZ.pdf b/parser/test_declarations/826725_LILIAN_GRACIELA_SAMANIEGO_GONZALEZ.pdf new file mode 100644 index 0000000..bf03e74 Binary files /dev/null and b/parser/test_declarations/826725_LILIAN_GRACIELA_SAMANIEGO_GONZALEZ.pdf differ