# dataOperations/dfOperations.py (main branch) - andretocci/dataOperations - GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
import pandas as pd
import numpy as np
import re
from datetime import datetime, timedelta
from datetime import date
import math
import string


class dfOps:

    def ano_mes_col(df, ano_col_name='ano', mes_col_name='mes'):
        """
        Add an integer year-month (YYYYMM) column named ``ano_mes`` to ``df``.

        Year and month are coerced to integers before being combined, so
        integer, whole-number float (``3.0``) and numeric-string (``'3'``)
        columns are all accepted. Fractional values are truncated by the
        integer cast.

        Parameters
        ----------
        df : pd.DataFrame
            Frame that is modified in place: it gains (or has overwritten) the
            ``ano_mes`` column.
        ano_col_name : str, optional
            Name of the year column present in ``df``.
        mes_col_name : str, optional
            Name of the month column present in ``df``.

        Returns
        -------
        pd.Series
            The new ``df['ano_mes']`` integer column, e.g. year 2020 and
            month 3 give 202003.

        Raises
        ------
        ValueError
            If a year or month value cannot be converted to an integer
            (non-numeric text or missing values).
        """
        ano = pd.to_numeric(df[ano_col_name]).astype('int64')
        mes = pd.to_numeric(df[mes_col_name]).astype('int64')
        # year * 100 + month is the zero-padded "YYYY" + "MM" concatenation,
        # computed without a per-row Python callback.
        df['ano_mes'] = ano * 100 + mes
        return df['ano_mes']

    def try_force_col_types(df, date_match = None, skip_pattern = None,inplace = None, print_progress = None):
        """
        Best-effort conversion of every column of a DataFrame to datetime or number.

        Columns whose name matches ``date_match`` are parsed with
        ``pd.to_datetime(..., yearfirst=True)``, so date values should have the
        year first. Other columns are passed to ``pd.to_numeric`` unless their
        name matches ``skip_pattern``. A column whose conversion fails is left
        unchanged.

        Parameters
        ----------
        df : pd.DataFrame
            Frame whose columns will be converted.
        date_match : str, optional
            Regex pattern that identifies date column names
            (default: 'date|data|Date|Data|dia|Dia|day|Day|timestamp').
        skip_pattern : str, optional
            Regex pattern that identifies column names that must not be
            converted, e.g. identifiers (default: '_id$|^id_|_id_|^ $').
        inplace : bool, optional
            When truthy the given ``df`` is modified; otherwise (default) a
            copy is converted and the input is left untouched.
        print_progress : bool, optional
            Print the skipped columns, for debugging (default: False).

        Returns
        -------
        pd.DataFrame
            The converted frame (``df`` itself when ``inplace`` is truthy).
        """

        # Any falsy value (None or an explicit False) means "work on a copy".
        if not inplace:
            df = df.copy()
        if date_match is None:
            date_match = 'date|data|Date|Data|dia|Dia|day|Day|timestamp'
        if skip_pattern is None:
            skip_pattern = '_id$|^id_|_id_|^ $'
        if print_progress is None:
            print_progress = False

        for coluna in df.columns:
            try:
                if re.search(date_match, coluna):
                    df[coluna] = pd.to_datetime(df[coluna], yearfirst= True)
                else:
                    # Skip the columns identified by skip_pattern (possible IDs).
                    if re.search(skip_pattern, coluna):
                        if print_progress:
                            print('[try_force_col_types] Coluna_pulada_skip_pattern: ', coluna)
                        continue
                    df[coluna] = pd.to_numeric(df[coluna])
            except Exception:
                # Deliberate best effort: a column that cannot be converted
                # (or whose label is not a string) keeps its original values.
                if print_progress:
                    print('[try_force_col_types] Coluna_pulada_tipo: ', coluna)
        return df

    def value_and_date_conv(df, colunas_datas = None, colunas_valor = None):
        """
        Convert the listed columns of ``df`` in place to datetimes and numbers.

        Value columns holding text are expected to use '.' as thousands
        separator and ',' as decimal separator (e.g. '1.234,50'); both are
        normalised before the numeric conversion. Columns that are not text
        are passed straight to ``pd.to_numeric``.

        Parameters
        ----------
        df : pd.DataFrame
            Frame to be modified in place.
        colunas_datas : list, optional
            Names of the date columns to be converted with ``pd.to_datetime``.
        colunas_valor : list, optional
            Names of the value columns to be converted to numbers. Every
            listed column is converted, regardless of how many date columns
            were given.

        Returns
        -------
        pd.DataFrame
            The same ``df`` object, after the in-place conversion.
        """
        if colunas_datas is None:
            colunas_datas = []
        if colunas_valor is None:
            colunas_valor = []

        for col_datas in colunas_datas:
            df[col_datas] = pd.to_datetime(df[col_datas])

        for col_valor in colunas_valor:
            try:
                # regex=False: '.' must be a literal dot, not the regex wildcard.
                sem_milhar = df[col_valor].str.replace('.', '', regex=False)
                df[col_valor] = sem_milhar.str.replace(',', '.', regex=False)
            except AttributeError:
                # The .str accessor is unavailable: the column is not text,
                # so there are no separators to normalise.
                pass
            df[col_valor] = pd.to_numeric(df[col_valor])
        return df

    def dividir_parcelas_periodo(df,
                                date_col = 'data',
                                n_parcelas_col = 'parcelas',
                                value_col = 'valor',
                                print_progress = False):
        """
        Expand every row into one row per installment ("parcela").

        A row with N installments becomes N rows: the value is divided evenly
        by N, the date of the k-th row is the original date plus k months
        (k = 0 .. N-1) and the installment column holds k.

        Parameters
        ----------
        df : pd.DataFrame
            Source data; it is copied, the input is not modified.
        date_col : str
            Name of the column with the date of the first installment. Values
            must expose ``year``, ``month`` and ``day``.
        n_parcelas_col : str
            Name of the column with the number of installments of each row.
        value_col : str
            Name of the column with the total value to be split.
        print_progress : bool
            Print intermediate values, for debugging.

        Returns
        -------
        pd.DataFrame
            Columns ``id`` (row position in the input), ``index`` (the input's
            original index), the untouched input columns, and then
            ``date_col``, ``n_parcelas_col`` (0-based installment number) and
            ``value_col`` (value of one installment).
        """
        import calendar  # only needed here, to find the last day of a month

        def add_months(mydate, plus_n_months, print_progress = False):
            """
            Return ``mydate`` moved forward by ``plus_n_months`` calendar months.

            Parameters
            ----------
            mydate : date-like
                Object exposing ``year``, ``month`` and ``day``.
            plus_n_months : int
                Months to be added.
            print_progress : bool
                Print intermediate values, for debugging.

            Returns
            -------
            datetime
                Midnight of the resulting day. The day of month is kept unless
                the target month is shorter, in which case that month's last
                day is used.
            """
            # Zero-based month arithmetic, so December stays in its own year.
            years_ahead, month_index = divmod(mydate.month - 1 + plus_n_months, 12)
            year = mydate.year + years_ahead
            month = month_index + 1
            # Clamp only when the target month really is shorter (leap years included).
            day = min(mydate.day, calendar.monthrange(year, month)[1])

            data_final = datetime(year, month, day)
            if print_progress:
                print('[add_months]', year, '/',month, '/',day)
                print('[add_months] output:',data_final)
            return data_final


        df = df.copy().reset_index()
        res = []
        for id_, date_value, parcela_value, valor_value  in zip(df.index, df[date_col],df[n_parcelas_col], df[value_col]) :

            final_value = valor_value / parcela_value
            if print_progress:
                print('########################')
                print('[dividir_parcelas_periodo]data: ', date_value)
                print('[dividir_parcelas_periodo]parcelas: ', parcela_value)
                print('[dividir_parcelas_periodo]valor: ', valor_value)
                print('[dividir_parcelas_periodo]valor da parcela: ', final_value)

            for parcelas in range(int(parcela_value)):
                res.append([id_, add_months(date_value, parcelas, print_progress = print_progress), parcelas, final_value])

        res_df = pd.DataFrame(res, columns=['id', date_col, n_parcelas_col, value_col])
        # Swap the three original columns for the expanded ones, joined back
        # on the positional row id.
        df.drop([date_col, n_parcelas_col, value_col], axis=1, inplace=True)
        df.index.name = 'id'
        df.reset_index(inplace=True)
        df = pd.merge(df, res_df, how='left', on='id')
        return df

    def pivot_calendario(df,
                     key_colname_list,
                     pivoted_colmns = ['mes'],
                     value_colum = 'valor',
                     sum_colmns = True,
                     sum_colmns_cum = False,
                     fill_na = 0 ):
        """
        Pivot a DataFrame: key columns become rows, ``pivoted_colmns`` become
        columns and ``value_colum`` is summed in every cell.

        Parameters
        ----------
        df : pd.DataFrame
            Source data.
        key_colname_list : list
            Column names used as row keys of the pivot.
        pivoted_colmns : list
            Column names whose values become the pivot's columns
            (default: ['mes']). The list is only read, never modified.
        value_colum : str
            Name of the column to be aggregated (summed).
        sum_colmns : bool
            When True, add a '~Total' row (sum of every pivoted column) and a
            'Total' column (sum of every row).
        sum_colmns_cum : bool
            When True (and ``sum_colmns`` is True), also add a '~Total_cum' row
            with the cumulative sum of the '~Total' row.
        fill_na : scalar
            Value used for key/column combinations without data. It should be
            numeric when ``sum_colmns`` is True, because the 'Total' column is
            summed over the filled cells.

        Returns
        -------
        pd.DataFrame
            Pivot table indexed by ``key_colname_list``, rounded to 2 decimals.
        """

        data = df.groupby(key_colname_list + pivoted_colmns,as_index=False)[value_colum].sum()

        # Append the total rows, labelled in every key column.
        if sum_colmns:
            total = df.groupby(pivoted_colmns,as_index=False)[value_colum].sum()
            total_cum = total.copy()
            total_cum[value_colum] = total_cum[value_colum].cumsum()
            for item in key_colname_list:
                total[item] = '~Total'
                total_cum[item] = '~Total_cum'
            if sum_colmns_cum:
                data = pd.concat([data, total, total_cum],sort=True)
            else:
                data = pd.concat([data, total],sort=True)

        # The pivot is built in both modes; only the totals are optional.
        x = pd.pivot_table(data, values= value_colum , index = key_colname_list, columns = pivoted_colmns, aggfunc='sum', fill_value=fill_na).round(2).reset_index()
        x = x.sort_values(key_colname_list[0])
        x = x.set_index(key_colname_list)

        if sum_colmns:
            x['Total'] = x.sum(axis= 1)

        return x

    def tratamento_caracteres(texto_list, lista_replace = None):
        """
        Normalise text: lower-case it, apply a list of regex replacements and
        strip trailing underscores.

        With the default replacement list this:
            - replaces spaces with '_';
            - replaces some accented characters with their plain letter;
            - removes trailing blanks (they become '_' and are stripped).
        The default pattern list is:
            (' ', '_'),
            (r'à|á|ã', 'a'),
            (r'ç', 'c'),
            (r'õ|ó|ò', 'o'),
            (r'é|ê', 'e'),
            (r'í|ì', 'i'),
            (r'ú|ù', 'u')

        Parameters
        ----------
        texto_list : str or iterable of str
            Text, or texts, to be normalised.
        lista_replace : list, optional
            List of ``(regex_pattern, replacement)`` tuples, applied in order
            with ``re.sub`` (default: the list shown above).

        Returns
        -------
        str or list of str
            A single string when ``texto_list`` is a string, otherwise a list
            with one normalised string per input item.
        """

        # Default replacement list.
        if lista_replace is None:
            lista_replace = [ (' ', '_'),
                            (r'à|á|ã', 'a'),
                            (r'ç', 'c'),
                            (r'õ|ó|ò', 'o'),
                            (r'é|ê', 'e'),
                            (r'í|ì', 'i'),
                            (r'ú|ù', 'u')]

        is_str = isinstance(texto_list, str)
        if is_str:
            texto_list = [texto_list]

        res = []
        for texto in texto_list:
            for padrao, substituto in lista_replace:
                # Lower-casing happens before every substitution, so the
                # patterns only need to cover lower-case characters.
                texto = re.sub(padrao, substituto, texto.lower())
            # Drop trailing underscores; also safe when the text ends up empty.
            res.append(texto.rstrip('_'))

        # Give a plain string back when a plain string was passed in.
        if is_str:
            return res[0]
        return res

    def a_day_in_previous_month(dt, months=1, format_str=True):
        """
        Return the first day of the month that is ``months`` months before ``dt``.

        Parameters
        ----------
        dt : date or datetime
            Reference date.
        months : int, optional
            How many months to go back (default: 1). With 0, ``dt`` is
            returned without being moved to the first day of its month.
        format_str : bool, optional
            When True (default) the result is formatted as 'YYYY-MM-DD'.

        Returns
        -------
        str or date/datetime
            The formatted string, or the date object when ``format_str`` is False.
        """
        for _ in range(months):
            # First day of this month minus one day is the last day of the
            # previous month; then snap to that month's first day.
            dt = dt.replace(day=1) - timedelta(days=1)
            dt = dt.replace(day=1)

        if format_str:
            dt = dt.strftime('%Y-%m-%d')

        return dt


    # def calculo_de_parcelas(df, parcelas_colname='numero_de_parcelas', valor_colname='valor' , div_valor_parcelas=True):

    # df = df.copy()

    # df['contagem_parcelas'] = df[parcelas_colname]

    # #Separando DF entre 1 parcela e mais do que 1
    # df_parcelas = df[(df[parcelas_colname] > 1)].copy()
    # df_parcelas_1 = df[(df[parcelas_colname] == 1)].copy()

    # #Divisão do montante pelas parcelas, caso True
    # if div_valor_parcelas:
    #     df_parcelas[valor_colname] = df_parcelas[valor_colname] / df_parcelas[parcelas_colname]

    # #Loop no DF de parcelas
    # df_temp = df_parcelas.copy()
    # for i in range(len(df_temp)):

    #     #Selecionando a linha e copiando
    #     df_copy_parcelas = df_temp.iloc[i:i+1,:].copy()

    #     #Loop que altera a data da linha
    #     for e in range(int(df_copy_parcelas[parcelas_colname][:1])-1):
    #     if int(df_copy_parcelas.mes) == 12:
    #         df_copy_parcelas.mes = 1
    #         df_copy_parcelas.ano = df_copy_parcelas.ano +  1
    #         df_copy_parcelas['contagem_parcelas'] = df_copy_parcelas[parcelas_colname][:1] - e - 1
    #     else:
    #         df_copy_parcelas.mes = df_copy_parcelas.mes + 1
    #         df_copy_parcelas['contagem_parcelas'] = df_copy_parcelas[parcelas_colname][:1] - e - 1
    #     df_parcelas = pd.concat([df_parcelas, df_copy_parcelas])

    # df_parcelas.dia = df_parcelas.dia.apply(lambda x: 28 if x > 28 else x)
    # #Juntando DF Parcelado com o Não parcelado
    # df = pd.concat([df_parcelas, df_parcelas_1])
    # df['Data'] = pd.to_datetime(df['ano'].apply(str) + '-' + df['mes'].apply(str) + '-' + df['dia'].apply(str))
    # df = df.set_index('Data')

    # return df

    # def pivot_calendario(df, col1, list_col_datas = ['mes'], value_colum = 'valor', Total = True, fill_na = 0 ):
    # """
    # Função pivota um DF agrupado pelo groupby e calcula um total da linha se necessário.
    # """

    # data = df.groupby(col1 + list_col_datas,as_index=False)[value_colum].sum()

    # #Adcionando coluna de total
    # if Total:
    #     total = df.groupby(list_col_datas,as_index=False)[value_colum].sum()
    #     for item in col1:
    #     total[item] = '~Total'
    #     data = pd.concat([data, total],sort=True)

    # x = pd.pivot_table(data, values= value_colum , index = col1, columns = list_col_datas, aggfunc=np.sum, fill_value=0).round(2).reset_index()
    # x.replace(0,fill_na)
    # x.sort_values(col1[0], inplace=True)
    # x = x.set_index(col1)

    # return x

    # def tratamento_caracteres(texto_list):
    # """
    # Função:
    #     - Retira os espaços por '_';
    #     - Substitúi caracteres especiais;
    #     - Remove espaços em branco do final;
    # """

    # #Lista que será substituída
    # lista_replace = [ (' ', '_'),
    #                 (r'à|á|ã', 'a'),
    #                 (r'ç', 'c'),
    #                 (r'õ|ó|ò', 'o'),
    #                 (r'é|ê', 'e'),
    #                 (r'í|ì', 'i'),
    #                 (r'ú|ù', 'u')]
    # res = []

    # if isinstance(texto_list, str):
    #     texto = texto_list.lower()

    #     for termo in lista_replace:
    #     texto = re.sub(termo[0], termo[1], texto)

    #     #Removendo último caracterte em branco
    #     while texto[-1] == '_':
    #     texto = texto[0:-1]

    #     res.append(texto)

    # else:

    #     for texto in texto_list:
    #     texto = texto.lower()

    #     for termo in lista_replace:
    #         texto = re.sub(termo[0], termo[1], texto)

    #     if texto[-1] == '_':
    #         texto = texto[0:-1]

    #     res.append(texto)
    # return res


    # def juros_cumulativo_mensal(juros, parcelas, montante):
    # amortizacao = montante / parcelas
    # res = []
    # juros_list = []
    # amort_list = [amortizacao] * parcelas
    # for parcela in range(parcelas):
    #     if parcela == 0:
    #     res.append(amortizacao)
    #     else:
    #     res.append(res[parcela - 1] * (1 + juros))

    # juros_list = np.array(res) - np.array(amort_list)
    # return  {'amortizacao': amort_list, 'juros' : juros_list, 'valor_parcela' : res}

    # def amortizacao_tabela_sac(juros, parcelas, montante):
    # amortizacao = montante / parcelas
    # res = []
    # juros_list = []
    # amort_list = []

    # for i in range(parcelas):
    #     juros_periodo = (juros * (montante - (i * amortizacao)) )
    #     juros_list.append(juros_periodo)
    #     amort_list.append(amortizacao)
    #     res.append(amortizacao + juros_periodo)

    # #1666,67 + 0,68%*(200000-1*1666,67) = 1.348,66
    # return {'amortizacao': amort_list, 'juros' : juros_list, 'valor_parcela' : res}

    # def amortizacao_tabela_price(juros, parcelas, montante):
    # res = [np.pmt(juros, parcelas, montante) * -1] * parcelas
    # juros_list = np.ipmt(juros, range(1, parcelas + 1), parcelas, montante) * -1
    # amort_list = np.ppmt(juros, range(1, parcelas + 1), parcelas, montante) * -1


    # return {'amortizacao': amort_list, 'juros' : juros_list, 'valor_parcela' : res}