Merge pull request #3 from slgero/optimize_pandas_apply

slgero · web-flow · commit cdad91bd744d · 2020-08-04T13:24:32.000+03:00
Optimize pandas apply
diff --git a/receipt_parser/__init__.py b/receipt_parser/__init__.py
@@ -1,6 +1,6 @@
 """A package which allow parsing Reussian receipts."""
 
-__version__ = "0.0.23"
+__version__ = "0.0.24"
 __license__ = "MIT"
 
 
diff --git a/receipt_parser/finder.py b/receipt_parser/finder.py
@@ -15,6 +15,42 @@
 # pylint: disable=C1801
 
 
+def df_apply(data: pd.DataFrame, func, axis: int = 1) -> pd.DataFrame:
+    """
+        Row-wise ``apply`` that unpacks the columns into positional arguments.
+        Every column of `data` is passed to `func`, in column order.
+
+        Parameters
+        ----------
+        data : pd.DataFrame
+            The data on which the `func` function will be applied.
+        func : function
+            Called as ``func(x[col_0], x[col_1], ...)`` for each row.
+        axis : {0 or 'index', 1 or 'columns'}, default=1
+            Axis passed through to ``DataFrame.apply``.
+
+        Returns
+        -------
+        pd.DataFrame or pd.Series
+            Whatever ``DataFrame.apply`` builds from the values returned
+            by `func`.
+
+        Examples
+        --------
+        >>> from pandas import DataFrame
+
+        >>> DataFrame.my_apply = df_apply
+        >>> df[['name', 'brand']].my_apply(foo)
+        """
+
+    # Materialise the labels once so the per-row lambda only does lookups.
+    _cols = list(data.columns)
+
+    # Unpacking every selected column covers the 2- and 3-column call sites
+    # and no longer raises IndexError / drops columns for other widths.
+    return data.apply(lambda x: func(*[x[col] for col in _cols]), axis=axis)
+
+
 class Finder:
     """
     Search and recognize the name, category and brand of a product
@@ -63,6 +99,7 @@ class Finder:
     def __init__(self, pathes: Optional[Dict[str, str]] = None):
         pathes = pathes or {}
         self.mystem = Mystem()
+        pd.DataFrame.appl = df_apply
 
         # Init model:
         model_params = {"num_class": 21, "embed_dim": 50, "vocab_size": 500}
@@ -308,60 +345,48 @@ def __find_all(self, verbose: int) -> None:
         self.__print_logs("Before:", verbose)
 
         # Find brands:
-        self.data[["name_norm", "brand_norm"]] = self.data.apply(
-            lambda x: self.find_brands(x["name_norm"], x["brand_norm"]), axis=1
-        )
+        self.data[["name_norm", "brand_norm"]] = self.data[
+            ["name_norm", "brand_norm"]
+        ].appl(self.find_brands)
         self.__print_logs("Find brands:", verbose)
 
         # Find product and category:
-        self.data[["name_norm", "product_norm", "cat_norm"]] = self.data.apply(
-            lambda x: self.find_product(x["name_norm"], x["product_norm"]), axis=1
-        )
+        self.data[["name_norm", "product_norm", "cat_norm"]] = self.data[
+            ["name_norm", "product_norm"]
+        ].appl(self.find_product)
         self.__print_logs("Find product and category:", verbose)
 
         # Remove `-`:
         self.data["name_norm"] = self.data["name_norm"].str.replace("-", " ")
-        self.data[["name_norm", "product_norm", "cat_norm"]] = self.data.apply(
-            lambda x: self.find_product(
-                x["name_norm"], x["product_norm"], x["cat_norm"]
-            ),
-            axis=1,
-        )
+        self.data[["name_norm", "product_norm", "cat_norm"]] = self.data[
+            ["name_norm", "product_norm", "cat_norm"]
+        ].appl(self.find_product)
         self.__print_logs(
             "Remove `-` and the second attempt to find a product:", verbose
         )
 
         # Use Mystem:
-        self.data["name_norm"] = self.data.apply(
-            lambda x: self._use_mystem(x["name_norm"], x["product_norm"]), axis=1
-        )
-        self.data[["name_norm", "product_norm", "cat_norm"]] = self.data.apply(
-            lambda x: self.find_product(
-                x["name_norm"], x["product_norm"], x["cat_norm"]
-            ),
-            axis=1,
+        self.data["name_norm"] = self.data[["name_norm", "product_norm"]].appl(
+            self._use_mystem
         )
+        self.data[["name_norm", "product_norm", "cat_norm"]] = self.data[
+            ["name_norm", "product_norm", "cat_norm"]
+        ].appl(self.find_product)
         self.__print_logs(
             "Use Mystem for lemmatization and the third attempt to find a product:",
             verbose,
         )
 
         # Find category:
-        self.data[["product_norm", "cat_norm"]] = self.data.apply(
-            lambda x: self.find_category(
-                x["name_norm"], x["product_norm"], x["cat_norm"]
-            ),
-            axis=1,
-        )
+        self.data[["product_norm", "cat_norm"]] = self.data[
+            ["name_norm", "product_norm", "cat_norm"]
+        ].appl(self.find_category)
         self.__print_logs("Find the remaining categories:", verbose)
 
         # Find product by brand:
-        self.data[["product_norm", "brand_norm", "cat_norm"]] = self.data.apply(
-            lambda x: self.find_product_by_brand(
-                x["product_norm"], x["brand_norm"], x["cat_norm"]
-            ),
-            axis=1,
-        )
+        self.data[["product_norm", "brand_norm", "cat_norm"]] = self.data[
+            ["product_norm", "brand_norm", "cat_norm"]
+        ].appl(self.find_product_by_brand)
         self.__print_logs("Find product by brand:", verbose)
 
     def find_all(
diff --git a/receipt_parser/normalizer.py b/receipt_parser/normalizer.py
@@ -2,6 +2,9 @@
 import re
 from typing import Optional, Union, Dict
 import pandas as pd  # type: ignore
+from pandarallel import pandarallel  # type: ignore
+
+pandarallel.initialize(progress_bar=False, verbose=0)
 
 try:
     # pylint: disable=line-too-long
@@ -10,6 +13,87 @@
     from dicts import PRODUCTS, BRANDS, SLASH_PRODUCTS, BRANDS_WITH_NUMBERS  # type: ignore
 
 
+# pylint: disable=bad-continuation
+class Apply:
+    """`apply` wrappers for pd.Series / pd.DataFrame with optional parallelism."""
+
+    @staticmethod
+    def series_apply(data: pd.Series, func, use_parallel: Optional[bool] = None):
+        """
+        Apply `func` to every element of a Series, optionally in parallel.
+
+        Parameters
+        ----------
+        data : pd.Series
+            The data on which the `func` function will be applied.
+        func : function
+            Function to apply to each element.
+        use_parallel : Optional[bool], default=None
+            If None, `parallel_apply` is used when `data` has 10000+ items.
+
+        Returns
+        -------
+        pd.Series or pd.DataFrame
+            Result of applying ``func`` on the Series.
+
+        Examples
+        --------
+        >>> from pandas import Series
+
+        >>> Series.my_apply = Apply.series_apply
+        >>> df['name'].my_apply(foo)
+        """
+
+        if use_parallel is None:
+            use_parallel = len(data) >= 10000
+        if use_parallel:
+            return data.parallel_apply(func)
+        return data.apply(func)
+
+    @staticmethod
+    def df_apply(
+        data: pd.DataFrame, func, use_parallel: Optional[bool] = None, axis: int = 1
+    ) -> pd.DataFrame:
+        """
+        Row-wise `apply` that unpacks the columns into positional arguments.
+        Every column of `data` is passed to `func`, in column order.
+
+        Parameters
+        ----------
+        data : pd.DataFrame
+            The data on which the `func` function will be applied.
+        func : function
+            Called as ``func(x[col_0], x[col_1], ...)`` for each row.
+        use_parallel : Optional[bool], default=None
+            If None, `parallel_apply` is used when `data` has 10000+ rows.
+        axis : {0 or 'index', 1 or 'columns'}, default=1
+            Axis along which the function is applied.
+
+        Returns
+        -------
+        pd.DataFrame
+            Result of applying ``func`` along the given axis of the DataFrame.
+
+        Examples
+        --------
+        >>> from pandas import DataFrame
+
+        >>> DataFrame.my_apply = Apply.df_apply
+        >>> df[['name', 'brand']].my_apply(foo)
+        """
+
+        _cols = list(data.columns)
+        if use_parallel is None:
+            use_parallel = len(data) >= 10000
+
+        def _row_func(row):
+            # Pass every selected column so 3-column callers keep working too.
+            return func(*[row[col] for col in _cols])
+
+        apply = data.parallel_apply if use_parallel else data.apply
+        return apply(_row_func, axis=axis)
+
+
 class Normalizer:
     """
     Normalize product description: expand abbreviations,
@@ -52,6 +136,10 @@ def __init__(self, pathes: Optional[Dict[str, str]] = None):
             pathes.get("brands_en", "data/cleaned/brands_en.csv")
         )["brand"].values
 
+        # Init user define apply function:
+        pd.DataFrame.appl = Apply.df_apply
+        pd.Series.appl = Apply.series_apply
+
     @staticmethod
     def _remove_numbers(name: str) -> pd.Series:
         """Remove all words in product description which contain numbers."""
@@ -162,20 +250,17 @@ def normalize(self, data: Union[pd.Series, str]) -> pd.DataFrame:
 
         data = self.__transform_data(data)
         data["name_norm"] = data["name"].str.lower()
-        data[["name_norm", "brand_norm"]] = data["name_norm"].apply(
-            self._remove_numbers
-        )
-        data[["name_norm", "product_norm", "brand_norm"]] = data.apply(
-            lambda x: self._remove_punctuation(x["name_norm"], x["brand_norm"]), axis=1
-        )
-        data["name_norm"] = data["name_norm"].apply(self._remove_one_and_two_chars)
-        data[["name_norm", "brand_norm"]] = data.apply(
-            lambda x: self.find_en_brands(x["name_norm"], x["brand_norm"]), axis=1
+        data[["name_norm", "brand_norm"]] = data["name_norm"].appl(self._remove_numbers)
+        data[["name_norm", "product_norm", "brand_norm"]] = data[
+            ["name_norm", "brand_norm"]
+        ].appl(self._remove_punctuation)
+        data["name_norm"] = data["name_norm"].appl(self._remove_one_and_two_chars)
+        data[["name_norm", "brand_norm"]] = data[["name_norm", "brand_norm"]].appl(
+            self.find_en_brands
         )
-        data["name_norm"] = data["name_norm"].apply(self._remove_words_in_blacklist)
-        data["name_norm"] = data["name_norm"].apply(self._replace_with_product_dict)
-        data[["name_norm", "brand_norm"]] = data.apply(
-            lambda x: self._remove_all_english_words(x["name_norm"], x["brand_norm"]),
-            axis=1,
+        data["name_norm"] = data["name_norm"].appl(self._remove_words_in_blacklist)
+        data["name_norm"] = data["name_norm"].appl(self._replace_with_product_dict)
+        data[["name_norm", "brand_norm"]] = data[["name_norm", "brand_norm"]].appl(
+            self._remove_all_english_words
         )
         return data
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,8 @@
 numpy >= 1.18.3
 pandas >= 1.0.3
+pandarallel >= 1.4.8
 pymystem3 >= 0.2.0
+setuptools
 torch
 torchvision
 wget >= 3.2