pandas-dev · jreback · May 28, 2021 · Mar 13, 2021 · Mar 13, 2021 · Mar 13, 2021
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -349,10 +349,29 @@ error_bad_lines : boolean, default ``True``
   returned. If ``False``, then these "bad lines" will dropped from the
   ``DataFrame`` that is returned. See :ref:`bad lines <io.bad_lines>`
   below.
+
+  .. deprecated:: 1.3
+     The ``on_bad_lines`` parameter takes precedence over this parameter
+     when specified and should be used instead to specify the behavior
+     upon encountering a bad line.
 warn_bad_lines : boolean, default ``True``
   If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for
   each "bad line" will be output.
 
+  .. deprecated:: 1.3
+     The ``on_bad_lines`` parameter takes precedence over this parameter
+     when specified and should be used instead to specify the behavior
+     upon encountering a bad line.
+on_bad_lines : {None, 'error', 'warn', 'skip'}, default ``None``
+    Specifies what to do upon encountering a bad line (a line with too many fields).
+    The default value of None will defer to ``error_bad_lines`` and ``warn_bad_lines``.
+    Specifying 'error' will cause an exception to be raised. Otherwise, the "bad lines"
+    will be dropped from the DataFrame, with a warning raised if 'warn' is specified.
+    This parameter takes precedence over parameters ``error_bad_lines`` and ``warn_bad_lines``
+    if specified.
+
+    .. versionadded:: 1.3
+
 .. _io.dtypes:
 
 Specifying column data types
@@ -1244,7 +1263,7 @@ You can elect to skip bad lines:
 
 .. code-block:: ipython
 
-    In [29]: pd.read_csv(StringIO(data), error_bad_lines=False)
+    In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn")
     Skipping line 3: expected 3 fields, saw 4
 
     Out[29]:

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -480,6 +480,7 @@ Deprecations
 - Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`)
 - Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`)
 - Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`)
+- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :func:`read_csv` in favor of ``on_bad_lines`` (:issue:`15122`)
 - Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`)
 - Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
 

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -149,6 +149,11 @@ cdef extern from "parser/tokenizer.h":
 
     enum: ERROR_OVERFLOW
 
+    ctypedef enum BadLineHandleMethod:
+        ERROR,
+        WARN,
+        SKIP
+
     ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
                                   int *status, const char *encoding_errors)
     ctypedef int (*io_cleanup)(void *src)
@@ -201,8 +206,7 @@ cdef extern from "parser/tokenizer.h":
         int usecols
 
         int expected_fields
-        int error_bad_lines
-        int warn_bad_lines
+        BadLineHandleMethod on_bad_lines
 
         # floating point options
         char decimal
@@ -353,6 +357,7 @@ cdef class TextReader:
                   usecols=None,
                   bint error_bad_lines=True,
                   bint warn_bad_lines=True,
+                  on_bad_lines = None,
                   bint na_filter=True,
                   na_values=None,
                   na_fvalues=None,
@@ -437,8 +442,23 @@ cdef class TextReader:
             self.parser.commentchar = ord(comment)
 
         # error handling of bad lines
-        self.parser.error_bad_lines = int(error_bad_lines)
-        self.parser.warn_bad_lines = int(warn_bad_lines)
+        if on_bad_lines is not None:
+            if on_bad_lines == "error":
+                self.parser.on_bad_lines = ERROR
+            elif on_bad_lines == "warn":
+                self.parser.on_bad_lines = WARN
+            elif on_bad_lines == "skip":
+                self.parser.on_bad_lines = SKIP
+            else:
+                raise ValueError(f"Argument {on_bad_lines} is invalid for "
+                                 "on_bad_lines")
+        else:
+            if error_bad_lines:
+                self.parser.on_bad_lines = ERROR
+            elif warn_bad_lines:
+                self.parser.on_bad_lines = WARN
+            else:
+                self.parser.on_bad_lines = SKIP
 
         self.skiprows = skiprows
         if skiprows is not None:
@@ -455,8 +475,7 @@ cdef class TextReader:
 
         # XXX
         if skipfooter > 0:
-            self.parser.error_bad_lines = 0
-            self.parser.warn_bad_lines = 0
+            self.parser.on_bad_lines = SKIP
 
         self.delimiter = delimiter
         self.delim_whitespace = delim_whitespace
@@ -571,9 +590,6 @@ cdef class TextReader:
             kh_destroy_str_starts(self.false_set)
             self.false_set = NULL
 
-    def set_error_bad_lines(self, int status):
-        self.parser.error_bad_lines = status
-
     def _set_quoting(self, quote_char, quoting):
         if not isinstance(quoting, int):
             raise TypeError('"quoting" must be an integer')

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -93,8 +93,7 @@ void parser_set_default_options(parser_t *self) {
     self->allow_embedded_newline = 1;
 
     self->expected_fields = -1;
-    self->error_bad_lines = 0;
-    self->warn_bad_lines = 0;
+    self->on_bad_lines = ERROR;
 
     self->commentchar = '#';
     self->thousands = '\0';
@@ -457,7 +456,7 @@ static int end_line(parser_t *self) {
         self->line_fields[self->lines] = 0;
 
         // file_lines is now the actual file line number (starting at 1)
-        if (self->error_bad_lines) {
+        if (self->on_bad_lines == ERROR) {
             self->error_msg = malloc(bufsize);
             snprintf(self->error_msg, bufsize,
                     "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n",
@@ -468,7 +467,7 @@ static int end_line(parser_t *self) {
             return -1;
         } else {
             // simply skip bad lines
-            if (self->warn_bad_lines) {
+            if (self->on_bad_lines == WARN) {
                 // pass up error message
                 msg = malloc(bufsize);
                 snprintf(msg, bufsize,

diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h
@@ -84,6 +84,12 @@ typedef enum {
     QUOTE_NONE
 } QuoteStyle;
 
+typedef enum {
+    ERROR,
+    WARN,
+    SKIP
+} BadLineHandleMethod;
+
 typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
                              int *status, const char *encoding_errors);
 typedef int (*io_cleanup)(void *src);
@@ -136,8 +142,7 @@ typedef struct parser_t {
     int usecols;  // Boolean: 1: usecols provided, 0: none provided
 
     int expected_fields;
-    int error_bad_lines;
-    int warn_bad_lines;
+    BadLineHandleMethod on_bad_lines;
 
     // floating point options
     char decimal;

diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -155,9 +155,6 @@ def _set_noconvert_columns(self):
         for col in noconvert_columns:
             self._reader.set_noconvert(col)
 
-    def set_error_bad_lines(self, status):
-        self._reader.set_error_bad_lines(int(status))
-
     def read(self, nrows=None):
         try:
             data = self._reader.read(nrows)

diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -75,8 +75,19 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
         self.quoting = kwds["quoting"]
         self.skip_blank_lines = kwds["skip_blank_lines"]
 
-        self.warn_bad_lines = kwds["warn_bad_lines"]
-        self.error_bad_lines = kwds["error_bad_lines"]
+        if kwds["on_bad_lines"] is not None:
+            if kwds["on_bad_lines"] not in {"error", "warn", "skip"}:
+                raise ValueError(
+                    f"Argument {kwds['on_bad_lines']} is invalid for on_bad_lines"
+                )
+            self.on_bad_lines = kwds["on_bad_lines"]
+        else:
+            if kwds["error_bad_lines"]:
+                self.on_bad_lines = "error"
+            elif kwds["warn_bad_lines"]:
+                self.on_bad_lines = "warn"
+            else:
+                self.on_bad_lines = "skip"
 
         self.names_passed = kwds["names"] or None
 
@@ -666,8 +677,10 @@ def _alert_malformed(self, msg, row_num):
         """
         Alert a user about a malformed row.
 
-        If `self.error_bad_lines` is True, the alert will be `ParserError`.
-        If `self.warn_bad_lines` is True, the alert will be printed out.
+        If `self.on_bad_lines` is 'error', the alert will be raised as a
+        `ParserError`.
+        If `self.on_bad_lines` is 'warn', the alert will be written to
+        ``sys.stderr``.
 
         Parameters
         ----------
@@ -676,9 +689,9 @@ def _alert_malformed(self, msg, row_num):
                   Because this row number is displayed, we 1-index,
                   even though we 0-index internally.
         """
-        if self.error_bad_lines:
+        if self.on_bad_lines == "error":
             raise ParserError(msg)
-        elif self.warn_bad_lines:
+        elif self.on_bad_lines == "warn":
             base = f"Skipping line {row_num}: "
             sys.stderr.write(base + msg + "\n")
 
@@ -699,7 +712,7 @@ def _next_iter_line(self, row_num):
             assert self.data is not None
             return next(self.data)
         except csv.Error as e:
-            if self.warn_bad_lines or self.error_bad_lines:
+            if self.on_bad_lines == "error" or self.on_bad_lines == "warn":
                 msg = str(e)
 
                 if "NULL byte" in msg or "line contains NUL" in msg:
@@ -896,11 +909,11 @@ def _rows_to_cols(self, content):
                 actual_len = len(l)
 
                 if actual_len > col_len:
-                    if self.error_bad_lines or self.warn_bad_lines:
+                    if self.on_bad_lines == "error" or self.on_bad_lines == "warn":
                         row_num = self.pos - (content_len - i + footers)
                         bad_lines.append((row_num, actual_len))
 
-                        if self.error_bad_lines:
+                        if self.on_bad_lines == "error":
                             break
                 else:
                     content.append(l)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -325,9 +325,29 @@
     default cause an exception to be raised, and no DataFrame will be returned.
     If False, then these "bad lines" will be dropped from the DataFrame that is
     returned.
+
+    .. deprecated:: 1.3
+       The ``on_bad_lines`` parameter takes precedence over this parameter
+       when specified and should be used instead to specify the behavior
+       upon encountering a bad line.
 warn_bad_lines : bool, default True
     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
     "bad line" will be output.
+
+    .. deprecated:: 1.3
+       The ``on_bad_lines`` parameter takes precedence over this parameter
+       when specified and should be used instead to specify the behavior
+       upon encountering a bad line.
+on_bad_lines : {{None, 'error', 'warn', 'skip'}}, default None
+    Specifies what to do upon encountering a bad line (a line with too many fields).
+    The default value of None will defer to ``error_bad_lines`` and ``warn_bad_lines``.
+    Specifying 'error' will cause an exception to be raised. Otherwise, the "bad lines"
+    will be dropped from the DataFrame, with a warning raised if 'warn' is specified.
+    This parameter takes precedence over parameters ``error_bad_lines`` and
+    ``warn_bad_lines`` if specified.
+
+    .. versionadded:: 1.3
+
 delim_whitespace : bool, default False
     Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
     used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
@@ -382,6 +402,7 @@
     "memory_map": False,
     "error_bad_lines": True,
     "warn_bad_lines": True,
+    "on_bad_lines": None,
     "float_precision": None,
 }
 
@@ -390,8 +411,8 @@
 _c_unsupported = {"skipfooter"}
 _python_unsupported = {"low_memory", "float_precision"}
 
-_deprecated_defaults: Dict[str, Any] = {}
-_deprecated_args: Set[str] = set()
+_deprecated_defaults: Dict[str, Any] = {"error_bad_lines": True, "warn_bad_lines": True}
+_deprecated_args: Set[str] = {"error_bad_lines", "warn_bad_lines"}
 
 
 def validate_integer(name, val, min_val=0):
@@ -533,6 +554,8 @@ def read_csv(
     # Error Handling
     error_bad_lines=True,
     warn_bad_lines=True,
+    # TODO: disallow and change None to 'error' in on_bad_lines in 2.0
+    on_bad_lines=None,
     # Internal
     delim_whitespace=False,
     low_memory=_c_parser_defaults["low_memory"],
@@ -613,6 +636,8 @@ def read_table(
     # Error Handling
     error_bad_lines=True,
     warn_bad_lines=True,
+    # TODO: disallow and change None to 'error' in on_bad_lines in 2.0
+    on_bad_lines=None,
     encoding_errors: Optional[str] = "strict",
     # Internal
     delim_whitespace=False,
@@ -924,7 +949,7 @@ def _clean_options(self, options, engine):
                     f"The {arg} argument has been deprecated and will be "
                     "removed in a future version.\n\n"
                 )
-                warnings.warn(msg, FutureWarning, stacklevel=2)
+                warnings.warn(msg, FutureWarning, stacklevel=6)
             else:
                 result[arg] = parser_default
 

diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
@@ -755,6 +755,23 @@ def test_encoding_surrogatepass(all_parsers):
             parser.read_csv(path)
 
 
+def test_deprecated_bad_lines_warns(all_parsers, csv1):
+    # GH 15122
+    parser = all_parsers
+    with tm.assert_produces_warning(
+        FutureWarning,
+        match="The error_bad_lines argument has been deprecated "
+        "and will be removed in a future version.\n\n",
+    ):
+        parser.read_csv(csv1, error_bad_lines=False)
+    with tm.assert_produces_warning(
+        FutureWarning,
+        match="The warn_bad_lines argument has been deprecated "
+        "and will be removed in a future version.\n\n",
+    ):
+        parser.read_csv(csv1, warn_bad_lines=False)
+
+
 def test_malformed_second_line(all_parsers):
     # see GH14782
     parser = all_parsers