From 1fdae298e82e72c233e18436ae6e189637b6d2f2 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Tue, 14 Jan 2025 14:11:38 -0800 Subject: [PATCH 1/5] Add samply.py for example testing, modify get_values_for_csv to fix extra decimal points --- pandas/core/indexes/base.py | 5 +++-- sample.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 sample.py diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 165fe109c4c94..8b5c1859c4b77 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7758,10 +7758,11 @@ def get_values_for_csv( if not quoting: values = values.astype(str) else: - values = np.array(values, dtype="object") + values = np.array(values, dtype="str") # Convert float16 -> string + values = values.astype(float, copy=False) # Parse string -> Python float64 - values[mask] = na_rep values = values.astype(object, copy=False) + values[mask] = na_rep return values from pandas.io.formats.format import FloatArrayFormatter diff --git a/sample.py b/sample.py new file mode 100644 index 0000000000000..d9d2cb7344ea1 --- /dev/null +++ b/sample.py @@ -0,0 +1,29 @@ +import pandas as pd +import numpy as np +import csv + +df = pd.DataFrame({"col": [8.5557]}, dtype=np.float32) + +print(df.to_csv()) + +print(df.to_csv(quoting=csv.QUOTE_NONNUMERIC)) + +values = np.array([8.57], dtype="float32") +print(values) +# [8.57] + +print(np.array(values, dtype="object")) +# [8.569999694824219] + +print(np.array(values, dtype="str")) + + +# Original array in float32 +float32_arr = np.array([1.2345678, 2.3456789], dtype=np.float32) + +# Convert to object +object_arr = float32_arr.astype(object) + +print("Original float32 array:", float32_arr) +print("Object array:", object_arr) +print("Data type of object_arr:", object_arr.dtype) From dc869021f578bfc48e14365ddb15e6e6f61447f3 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Mon, 27 Jan 2025 15:16:20 -0800 Subject: [PATCH 2/5] Modified base.py 
get_values_for_csv() to prevent extra decimal places for float16, float32 in output --- pandas/core/indexes/base.py | 12 ++++++------ sample.py | 29 ----------------------------- 2 files changed, 6 insertions(+), 35 deletions(-) delete mode 100644 sample.py diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8b5c1859c4b77..a49af51160e55 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7755,12 +7755,12 @@ def get_values_for_csv( if float_format is None and decimal == ".": mask = isna(values) - if not quoting: - values = values.astype(str) - else: - values = np.array(values, dtype="str") # Convert float16 -> string - values = values.astype(float, copy=False) # Parse string -> Python float64 - + # GH60699 + # Ensure quoting don't add extra decimal places in output for float16, float32 + if values.dtype in [np.float16, np.float32]: + values = np.array(values, dtype="str") + values = values.astype(float, copy=False) + values = values.astype(object, copy=False) values[mask] = na_rep return values diff --git a/sample.py b/sample.py deleted file mode 100644 index d9d2cb7344ea1..0000000000000 --- a/sample.py +++ /dev/null @@ -1,29 +0,0 @@ -import pandas as pd -import numpy as np -import csv - -df = pd.DataFrame({"col": [8.5557]}, dtype=np.float32) - -print(df.to_csv()) - -print(df.to_csv(quoting=csv.QUOTE_NONNUMERIC)) - -values = np.array([8.57], dtype="float32") -print(values) -# [8.57] - -print(np.array(values, dtype="object")) -# [8.569999694824219] - -print(np.array(values, dtype="str")) - - -# Original array in float32 -float32_arr = np.array([1.2345678, 2.3456789], dtype=np.float32) - -# Convert to object -object_arr = float32_arr.astype(object) - -print("Original float32 array:", float32_arr) -print("Object array:", object_arr) -print("Data type of object_arr:", object_arr.dtype) From c6aea1eeba6fcca90b2cc44bd58d86f0b6cfafe0 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Mon, 27 Jan 2025 15:18:36 -0800 Subject: 
[PATCH 3/5] Add tests to check to_csv for dtypes - float16, float32, float64 and quoting option enabled --- pandas/tests/frame/methods/test_to_csv.py | 34 +++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 9eafc69013ffe..d56d98ef2fca5 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1316,6 +1316,40 @@ def test_to_csv_quoting(self): expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv(quoting=csv.QUOTE_ALL) == expected + @pytest.mark.parametrize("data, dtype, expected_rows", + [ + # Test Case 1: float16 precision + ( + {"col": [8.57, 0.156, -0.312, 123.3, -54.5, np.nan]}, + "float16", + ['"","col"', '0,8.57', '1,0.156', '2,-0.312', '3,123.3', '4,-54.5', '5,""'] + ), + + # Test Case 2: float32 precision + ( + {"col": [8.57, 1.234567, -2.345678, 1e6, -1.5e6, np.nan]}, + "float32", + ['"","col"', '0,8.57', '1,1.234567', '2,-2.345678', '3,1000000.0', '4,-1500000.0', '5,""'] + ), + + # Test Case 3: float64 precision + ( + {"col": [8.57, 3.141592653589793, -2.718281828459045, 1.01e12, -5.67e11, np.nan]}, + "float64", + ['"","col"', '0,8.57', '1,3.141592653589793', '2,-2.718281828459045', + '3,1010000000000.0', '4,-567000000000.0', '5,""'] + ), + ] + ) + def test_to_csv_decimal_and_nonnumeric_quoting(self, data, dtype, expected_rows): + # https://github.com/pandas-dev/pandas/issues/60699 + # combination of float dtype, no special formatting and + # quoting is specified (quoting=csv.QUOTE_NONNUMERIC) + df = pd.DataFrame(data, dtype=dtype) + result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + def test_period_index_date_overflow(self): # see gh-15982 From d02a7a29dce072381335789fface9254b1feb221 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Mon, 27 Jan 2025 15:19:22 -0800 Subject: [PATCH 4/5] Add 
entry in whatsnew for bugfix --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index bf1b52d3a0957..287adf04194bd 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -697,6 +697,7 @@ I/O - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`) - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`) - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`) +- Bug in :meth:`DataFrame.to_csv` where `quoting=csv.QUOTE_NONNUMERIC` adds extra decimal places when ``dtype=float32``, ``dtype=float16`` and ``float_format=None`` in the csv output (:issue:`60699`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. 
(:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`) From 92512830bf013e640d1f19d055ea7fc356058046 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Mon, 27 Jan 2025 15:25:43 -0800 Subject: [PATCH 5/5] Fix ruff formatting, linting, namespace issues using pre-commit hooks --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/indexes/base.py | 9 ++-- pandas/tests/frame/methods/test_to_csv.py | 57 +++++++++++++++++------ 3 files changed, 50 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 287adf04194bd..a8c7c975a640b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -697,7 +697,7 @@ I/O - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`) - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`) - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`) -- Bug in :meth:`DataFrame.to_csv` where `quoting=csv.QUOTE_NONNUMERIC` adds extra decimal places when ``dtype=float32``, ``dtype=float16`` and ``float_format=None`` in the csv output (:issue:`60699`) +- Bug in :meth:`DataFrame.to_csv` where ``quoting=csv.QUOTE_NONNUMERIC`` adds extra decimal places when ``dtype=float32``, ``dtype=float16`` and ``float_format=None`` in the csv output (:issue:`60699`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. 
(:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a49af51160e55..543cbb46a67b0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7756,11 +7756,12 @@ def get_values_for_csv( mask = isna(values) # GH60699 - # Ensure quoting don't add extra decimal places in output for float16, float32 + # Ensure quoting don't add extra decimal places in output + # for float16, float32 if values.dtype in [np.float16, np.float32]: - values = np.array(values, dtype="str") - values = values.astype(float, copy=False) - + values = np.array(values, dtype="str") + values = values.astype(float, copy=False) + values = values.astype(object, copy=False) values[mask] = na_rep return values diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index d56d98ef2fca5..b3881f97beeeb 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1316,37 +1316,68 @@ def test_to_csv_quoting(self): expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv(quoting=csv.QUOTE_ALL) == expected - @pytest.mark.parametrize("data, dtype, expected_rows", + @pytest.mark.parametrize( + "data, dtype, expected_rows", [ # Test Case 1: float16 precision ( {"col": [8.57, 0.156, -0.312, 123.3, -54.5, np.nan]}, "float16", - ['"","col"', '0,8.57', '1,0.156', '2,-0.312', '3,123.3', '4,-54.5', '5,""'] + [ + '"","col"', + "0,8.57", + "1,0.156", + "2,-0.312", + "3,123.3", + "4,-54.5", + '5,""', + ], ), - # Test Case 2: float32 precision ( - {"col": [8.57, 1.234567, -2.345678, 1e6, -1.5e6, np.nan]}, + {"col": [8.57, 1.234567, -2.345678, 1e6, -1.5e6, np.nan]}, "float32", - ['"","col"', '0,8.57', 
'1,1.234567', '2,-2.345678', '3,1000000.0', '4,-1500000.0', '5,""'] + [ + '"","col"', + "0,8.57", + "1,1.234567", + "2,-2.345678", + "3,1000000.0", + "4,-1500000.0", + '5,""', + ], ), - # Test Case 3: float64 precision ( - {"col": [8.57, 3.141592653589793, -2.718281828459045, 1.01e12, -5.67e11, np.nan]}, + { + "col": [ + 8.57, + 3.141592653589793, + -2.718281828459045, + 1.01e12, + -5.67e11, + np.nan, + ] + }, "float64", - ['"","col"', '0,8.57', '1,3.141592653589793', '2,-2.718281828459045', - '3,1010000000000.0', '4,-567000000000.0', '5,""'] + [ + '"","col"', + "0,8.57", + "1,3.141592653589793", + "2,-2.718281828459045", + "3,1010000000000.0", + "4,-567000000000.0", + '5,""', + ], ), - ] + ], ) def test_to_csv_decimal_and_nonnumeric_quoting(self, data, dtype, expected_rows): # https://github.com/pandas-dev/pandas/issues/60699 - # combination of float dtype, no special formatting and + # combination of float dtype, no special formatting and # quoting is specified (quoting=csv.QUOTE_NONNUMERIC) - df = pd.DataFrame(data, dtype=dtype) - result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) + df = DataFrame(data, dtype=dtype) + result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected