Add TQDM to pandas_to_eland()

V1NAY8 · V1NAY8 · commit 9a2f9c8c6aa7 · 2021-09-24T22:25:41.000+05:30
diff --git a/eland/etl.py b/eland/etl.py
@@ -22,6 +22,7 @@
 import pandas as pd  # type: ignore
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import parallel_bulk
+from tqdm.notebook import tqdm  # type: ignore
 
 from eland import DataFrame
 from eland.common import DEFAULT_CHUNK_SIZE, PANDAS_VERSION, ensure_es_client
@@ -46,6 +47,7 @@ def pandas_to_eland(
     thread_count: int = 4,
     chunksize: Optional[int] = None,
     use_pandas_index_for_es_ids: bool = True,
+    show_progressbar: Optional[bool] = None,
 ) -> DataFrame:
     """
     Append a pandas DataFrame to an Elasticsearch index.
@@ -79,6 +81,10 @@ def pandas_to_eland(
     use_pandas_index_for_es_ids: bool, default 'True'
         * True: pandas.DataFrame.index fields will be used to populate Elasticsearch '_id' fields.
         * False: Ignore pandas.DataFrame.index when indexing into Elasticsearch
+    show_progressbar: Optional[bool], default 'None'
+        * True : show a progress bar only if we detect Jupyter Notebook (for now)
+        * False : don't show a progress bar
+        * None : show a progress bar only if we detect Jupyter Notebook
 
     Returns
     -------
@@ -184,35 +190,62 @@ def pandas_to_eland(
     else:
         es_client.indices.create(index=es_dest_index, body=mapping)
 
+    if show_progressbar is None or show_progressbar is True:
+        # Detect jupyter notebook
+        try:
+            from IPython import get_ipython  # type: ignore
+
+            ip = get_ipython()
+            if hasattr(ip, "kernel"):
+                show_progressbar = True
+        except ImportError:
+            show_progressbar = False
+
     def action_generator(
         pd_df: pd.DataFrame,
         es_dropna: bool,
         use_pandas_index_for_es_ids: bool,
         es_dest_index: str,
+        show_progressbar: Optional[bool],
     ) -> Generator[Dict[str, Any], None, None]:
-        for row in pd_df.iterrows():
-            if es_dropna:
-                values = row[1].dropna().to_dict()
-            else:
-                values = row[1].to_dict()
-
-            if use_pandas_index_for_es_ids:
-                # Use index as _id
-                id = row[0]
-
-                action = {"_index": es_dest_index, "_source": values, "_id": str(id)}
-            else:
-                action = {"_index": es_dest_index, "_source": values}
 
-            yield action
+        with tqdm(
+            total=pd_df.shape[0],
+            disable=not show_progressbar,
+            desc="Progress",
+        ) as progress_bar:
+            for row in pd_df.iterrows():
+                if es_dropna:
+                    values = row[1].dropna().to_dict()
+                else:
+                    values = row[1].to_dict()
+
+                if use_pandas_index_for_es_ids:
+                    # Use index as _id
+                    id = row[0]
+
+                    action = {
+                        "_index": es_dest_index,
+                        "_source": values,
+                        "_id": str(id),
+                    }
+                else:
+                    action = {"_index": es_dest_index, "_source": values}
+
+                progress_bar.update(1)
+                yield action
 
     # parallel_bulk is lazy generator so use deque to consume them immediately
     # maxlen = 0 because don't need results of parallel_bulk
     deque(
         parallel_bulk(
             client=es_client,
             actions=action_generator(
-                pd_df, es_dropna, use_pandas_index_for_es_ids, es_dest_index
+                pd_df=pd_df,
+                es_dropna=es_dropna,
+                use_pandas_index_for_es_ids=use_pandas_index_for_es_ids,
+                es_dest_index=es_dest_index,
+                show_progressbar=show_progressbar,
             ),
             thread_count=thread_count,
             chunk_size=int(chunksize / thread_count),
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -11,3 +11,7 @@ nox
 lightgbm
 pytest-cov
 mypy
+tqdm
+jupyter
+notebook
+ipywidgets
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
 elasticsearch>=7.7
 pandas>=1
 matplotlib
+tqdm
diff --git a/setup.py b/setup.py
@@ -75,6 +75,7 @@
         "pandas>=1.2,<1.4",
         "matplotlib",
         "numpy",
+        "tqdm",
     ],
     python_requires=">=3.7",
     package_data={"eland": ["py.typed"]},
diff --git a/tests/etl/test_pandas_to_eland.py b/tests/etl/test_pandas_to_eland.py
@@ -70,14 +70,16 @@ def test_es_if_exists_fail(self):
             "to 'append' or 'replace' data."
         )
 
-    def test_es_if_exists_replace(self):
+    @pytest.mark.parametrize("show_progressbar", [True, False, None])
+    def test_es_if_exists_replace(self, show_progressbar):
         # Assert that 'replace' allows for creation
         df1 = pandas_to_eland(
             pd_df2,
             es_client=ES_TEST_CLIENT,
             es_dest_index="test-index",
             es_if_exists="replace",
             es_refresh=True,
+            show_progressbar=show_progressbar,
         ).to_pandas()
         assert_frame_equal(pd_df2, df1)
 
diff --git a/tests/notebook/test_etl.ipynb b/tests/notebook/test_etl.ipynb