def try_coerce_to_numeric(ndf: pd.DataFrame):
    """Best-effort conversion of mostly-float ``object`` columns to float.

    A column is considered only when it has ``object`` dtype and more than
    half of its rows hold ``float`` instances (nulls are dropped before
    counting, so they count against the majority). For such a column:

    1. every non-float cell is passed through ``float()``;
    2. if that fails, cells that are strings containing a space (for example
       ``'92.026 123.903'``) are reduced to their first whitespace-separated
       token and the column is cast with ``astype(float)``.

    Columns that still cannot be converted are left exactly as they were,
    and the remaining columns are still processed.

    :param ndf: frame to inspect; it is never modified in place.
    :returns: a copy of ``ndf`` with the convertible columns replaced by
        float columns and the original index preserved. If ``ndf`` cannot be
        copied or inspected at all, ``ndf`` itself is returned unchanged.
    """

    def _first_token(value):
        # Multi-value cells keep only their first entry; anything else
        # passes through untouched.
        if isinstance(value, str) and ' ' in value:
            return value.split()[0]
        return value

    try:
        nndf = ndf.copy()
        object_columns = nndf.select_dtypes(include=['object']).columns
    except Exception:
        # Deliberately best-effort: an input we cannot inspect is handed
        # back as-is instead of failing the featurization pipeline.
        logger.debug(
            "Could not inspect frame for numeric coercion; returning input unchanged",
            exc_info=True,
        )
        return ndf

    for j in object_columns:
        try:
            column = nndf[j]
            num_floats = sum(isinstance(x, float) for x in column.dropna())
            if num_floats <= len(column) / 2:
                continue  # not mostly float: leave the column alone

            try:
                coerced = [
                    value if isinstance(value, float) else float(value)
                    for value in column
                ]
                message = "Coerced strings to floats"
            except (TypeError, ValueError):
                coerced = column.apply(_first_token).astype(float)
                message = "took first float of tuple in single cell"

            # Assign only after the whole column converted, so a failure
            # never leaves a half-rewritten column behind.
            nndf[j] = coerced
            logger.info(message)
        except Exception:
            # Best-effort per column: keep the original values and move on.
            logger.debug(
                "Could not coerce column %s to float; leaving it unchanged",
                j,
                exc_info=True,
            )

    return nndf
@pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
def test_type_edgecase(self):
    """Smoke-test ``umap()`` on a mostly-float column polluted with strings.

    About 5% of the values are replaced by their string form, and two cells
    hold several space-separated numbers. The test passes as long as
    ``umap()`` does not raise.
    """
    # Seeded generator so any failure can be reproduced.
    rng = np.random.default_rng(0)
    # Object dtype up front: writing a str into a float64 Series is an
    # incompatible-dtype setitem, which pandas has deprecated.
    values = pd.Series(rng.random(50)).astype(object)
    num_to_convert = int(len(values) * 0.05)
    indices_to_convert = rng.choice(len(values), num_to_convert, replace=False)
    for i in indices_to_convert:
        values.iloc[i] = str(values.iloc[i])
    values.loc[13] = '92.026 123.903 702.124'
    values.loc[33] = '26.092 903.123'

    # NOTE(review): a Series (not a DataFrame) is passed to nodes(), as in
    # the original test - confirm this is the intended input type.
    graphistry.nodes(values).umap()
    assert True