pandas-dev · jreback · May 28, 2021 · Mar 13, 2021 · Mar 13, 2021 · Mar 13, 2021
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -349,10 +349,38 @@ error_bad_lines : boolean, default ``True``
   returned. If ``False``, then these "bad lines" will dropped from the
   ``DataFrame`` that is returned. See :ref:`bad lines <io.bad_lines>`
   below.
+
+  .. deprecated:: 1.3
+     The ``on_bad_lines`` parameter takes precedence over this parameter
+     when specified and should be used instead to specify the behavior
+     upon encountering a bad line.
 warn_bad_lines : boolean, default ``True``
   If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for
   each "bad line" will be output.
 
+  .. deprecated:: 1.3
+     The ``on_bad_lines`` parameter takes precedence over this parameter
+     when specified and should be used instead to specify the behavior
+     upon encountering a bad line.
+on_bad_lines : {None, 'error', 'warn', 'skip'}, default ``None``
+    Specifies what to do upon encountering a bad line (a line with too many fields).
+    Allowed values are:
+
+        - ``None``, default option, defer to ``error_bad_lines`` and ``warn_bad_lines``.
+
+          Note: This option is only present for backwards-compatibility reasons and will
+          be removed after the removal of ``error_bad_lines`` and ``warn_bad_lines``.
+          Please do not specify it explicitly.
+
+        - 'error', raise an Exception when a bad line is encountered.
+        - 'warn', raise a warning when a bad line is encountered and skip that line.
+        - 'skip', skip bad lines without raising or warning when they are encountered.
+
+    This parameter takes precedence over parameters ``error_bad_lines`` and ``warn_bad_lines``
+    if specified.
+
+    .. versionadded:: 1.3
+
 .. _io.dtypes:
 
 Specifying column data types
@@ -1244,7 +1272,7 @@ You can elect to skip bad lines:
 
 .. code-block:: ipython
 
-    In [29]: pd.read_csv(StringIO(data), error_bad_lines=False)
+    In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn")
     Skipping line 3: expected 3 fields, saw 4
 
     Out[29]:

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -480,6 +480,7 @@ Deprecations
 - Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`)
 - Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`)
 - Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`)
+- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :func:`read_csv` and :func:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`)
 - Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`)
 - Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
 

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -149,6 +149,11 @@ cdef extern from "parser/tokenizer.h":
 
     enum: ERROR_OVERFLOW
 
+    ctypedef enum BadLineHandleMethod:
+        ERROR,
+        WARN,
+        SKIP
+
     ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
                                   int *status, const char *encoding_errors)
     ctypedef int (*io_cleanup)(void *src)
@@ -201,8 +206,7 @@ cdef extern from "parser/tokenizer.h":
         int usecols
 
         int expected_fields
-        int error_bad_lines
-        int warn_bad_lines
+        BadLineHandleMethod on_bad_lines
 
         # floating point options
         char decimal
@@ -351,8 +355,7 @@ cdef class TextReader:
                   thousands=None,
                   dtype=None,
                   usecols=None,
-                  bint error_bad_lines=True,
-                  bint warn_bad_lines=True,
+                  on_bad_lines = ERROR,
                   bint na_filter=True,
                   na_values=None,
                   na_fvalues=None,
@@ -436,9 +439,7 @@ cdef class TextReader:
                 raise ValueError('Only length-1 comment characters supported')
             self.parser.commentchar = ord(comment)
 
-        # error handling of bad lines
-        self.parser.error_bad_lines = int(error_bad_lines)
-        self.parser.warn_bad_lines = int(warn_bad_lines)
+        self.parser.on_bad_lines = on_bad_lines
 
         self.skiprows = skiprows
         if skiprows is not None:
@@ -455,8 +456,7 @@ cdef class TextReader:
 
         # XXX
         if skipfooter > 0:
-            self.parser.error_bad_lines = 0
-            self.parser.warn_bad_lines = 0
+            self.parser.on_bad_lines = SKIP
 
         self.delimiter = delimiter
         self.delim_whitespace = delim_whitespace
@@ -571,9 +571,6 @@ cdef class TextReader:
             kh_destroy_str_starts(self.false_set)
             self.false_set = NULL
 
-    def set_error_bad_lines(self, int status):
-        self.parser.error_bad_lines = status
-
     def _set_quoting(self, quote_char, quoting):
         if not isinstance(quoting, int):
             raise TypeError('"quoting" must be an integer')

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -93,8 +93,7 @@ void parser_set_default_options(parser_t *self) {
     self->allow_embedded_newline = 1;
 
     self->expected_fields = -1;
-    self->error_bad_lines = 0;
-    self->warn_bad_lines = 0;
+    self->on_bad_lines = ERROR;
 
     self->commentchar = '#';
     self->thousands = '\0';
@@ -457,7 +456,7 @@ static int end_line(parser_t *self) {
         self->line_fields[self->lines] = 0;
 
         // file_lines is now the actual file line number (starting at 1)
-        if (self->error_bad_lines) {
+        if (self->on_bad_lines == ERROR) {
             self->error_msg = malloc(bufsize);
             snprintf(self->error_msg, bufsize,
                     "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n",
@@ -468,7 +467,7 @@ static int end_line(parser_t *self) {
             return -1;
         } else {
             // simply skip bad lines
-            if (self->warn_bad_lines) {
+            if (self->on_bad_lines == WARN) {
                 // pass up error message
                 msg = malloc(bufsize);
                 snprintf(msg, bufsize,

diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h
@@ -84,6 +84,12 @@ typedef enum {
     QUOTE_NONE
 } QuoteStyle;
 
+typedef enum {
+    ERROR,  // stop tokenizing and report the bad line as an error
+    WARN,   // skip the bad line and pass a warning message up
+    SKIP    // skip the bad line silently
+} BadLineHandleMethod;
+
 typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
                              int *status, const char *encoding_errors);
 typedef int (*io_cleanup)(void *src);
@@ -136,8 +142,7 @@ typedef struct parser_t {
     int usecols;  // Boolean: 1: usecols provided, 0: none provided
 
     int expected_fields;
-    int error_bad_lines;
-    int warn_bad_lines;
+    BadLineHandleMethod on_bad_lines;
 
     // floating point options
     char decimal;

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -1,6 +1,7 @@
 from collections import defaultdict
 import csv
 import datetime
+from enum import Enum
 import itertools
 from typing import (
     Any,
@@ -114,6 +115,11 @@
 
 
 class ParserBase:
+    class BadLineHandleMethod(Enum):  # values mirror the C enum in tokenizer.h
+        ERROR = 0  # raise ParserError on a bad line
+        WARN = 1  # report the bad line on stderr, then skip it
+        SKIP = 2  # skip the bad line silently
+
     def __init__(self, kwds):
 
         self.names = kwds.get("names")
@@ -202,6 +208,25 @@ def __init__(self, kwds):
 
         self.handles: Optional[IOHandles] = None
 
+        # Bad line handling
+        on_bad_lines = kwds.get("on_bad_lines")
+        if on_bad_lines is not None:
+            if on_bad_lines == "error":
+                self.on_bad_lines = self.BadLineHandleMethod.ERROR
+            elif on_bad_lines == "warn":
+                self.on_bad_lines = self.BadLineHandleMethod.WARN
+            elif on_bad_lines == "skip":
+                self.on_bad_lines = self.BadLineHandleMethod.SKIP
+            else:
+                raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")
+        else:
+            if kwds.get("error_bad_lines"):
+                self.on_bad_lines = self.BadLineHandleMethod.ERROR
+            elif kwds.get("warn_bad_lines"):
+                self.on_bad_lines = self.BadLineHandleMethod.WARN
+            else:
+                self.on_bad_lines = self.BadLineHandleMethod.SKIP
+
     def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
         """
         Let the readers open IOHanldes after they are done with their potential raises.

diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -25,7 +25,18 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
         # open handles
         self._open_handles(src, kwds)
         assert self.handles is not None
-        for key in ("storage_options", "encoding", "memory_map", "compression"):
+
+        # Have to pass int, would break tests using TextReader directly otherwise :(
+        kwds["on_bad_lines"] = self.on_bad_lines.value
+
+        for key in (
+            "storage_options",
+            "encoding",
+            "memory_map",
+            "compression",
+            "error_bad_lines",
+            "warn_bad_lines",
+        ):
             kwds.pop(key, None)
         if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"):
             # error: Item "IO[Any]" of "Union[IO[Any], RawIOBase, BufferedIOBase,
@@ -155,9 +166,6 @@ def _set_noconvert_columns(self):
         for col in noconvert_columns:
             self._reader.set_noconvert(col)
 
-    def set_error_bad_lines(self, status):
-        self._reader.set_error_bad_lines(int(status))
-
     def read(self, nrows=None):
         try:
             data = self._reader.read(nrows)

diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -75,9 +75,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
         self.quoting = kwds["quoting"]
         self.skip_blank_lines = kwds["skip_blank_lines"]
 
-        self.warn_bad_lines = kwds["warn_bad_lines"]
-        self.error_bad_lines = kwds["error_bad_lines"]
-
         self.names_passed = kwds["names"] or None
 
         self.has_index_names = False
@@ -664,10 +661,11 @@ def _next_line(self):
 
     def _alert_malformed(self, msg, row_num):
         """
-        Alert a user about a malformed row.
+        Alert a user about a malformed row, depending on value of
+        `self.on_bad_lines` enum.
 
-        If `self.error_bad_lines` is True, the alert will be `ParserError`.
-        If `self.warn_bad_lines` is True, the alert will be printed out.
+        If `self.on_bad_lines` is ERROR, the alert will be `ParserError`.
+        If `self.on_bad_lines` is WARN, the alert will be printed out.
 
         Parameters
         ----------
@@ -676,9 +674,9 @@ def _alert_malformed(self, msg, row_num):
                   Because this row number is displayed, we 1-index,
                   even though we 0-index internally.
         """
-        if self.error_bad_lines:
+        if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
             raise ParserError(msg)
-        elif self.warn_bad_lines:
+        elif self.on_bad_lines == self.BadLineHandleMethod.WARN:
             base = f"Skipping line {row_num}: "
             sys.stderr.write(base + msg + "\n")
 
@@ -699,7 +697,10 @@ def _next_iter_line(self, row_num):
             assert self.data is not None
             return next(self.data)
         except csv.Error as e:
-            if self.warn_bad_lines or self.error_bad_lines:
+            if (
+                self.on_bad_lines == self.BadLineHandleMethod.ERROR
+                or self.on_bad_lines == self.BadLineHandleMethod.WARN
+            ):
                 msg = str(e)
 
                 if "NULL byte" in msg or "line contains NUL" in msg:
@@ -896,11 +897,14 @@ def _rows_to_cols(self, content):
                 actual_len = len(l)
 
                 if actual_len > col_len:
-                    if self.error_bad_lines or self.warn_bad_lines:
+                    if (
+                        self.on_bad_lines == self.BadLineHandleMethod.ERROR
+                        or self.on_bad_lines == self.BadLineHandleMethod.WARN
+                    ):
                         row_num = self.pos - (content_len - i + footers)
                         bad_lines.append((row_num, actual_len))
 
-                        if self.error_bad_lines:
+                        if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
                             break
                 else:
                     content.append(l)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -325,9 +325,38 @@
     default cause an exception to be raised, and no DataFrame will be returned.
     If False, then these "bad lines" will be dropped from the DataFrame that is
     returned.
+
+    .. deprecated:: 1.3
+       The ``on_bad_lines`` parameter takes precedence over this parameter
+       when specified and should be used instead to specify the behavior
+       upon encountering a bad line.
 warn_bad_lines : bool, default True
     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
     "bad line" will be output.
+
+    .. deprecated:: 1.3
+       The ``on_bad_lines`` parameter takes precedence over this parameter
+       when specified and should be used instead to specify the behavior
+       upon encountering a bad line.
+on_bad_lines : {{None, 'error', 'warn', 'skip'}}, default ``None``
+    Specifies what to do upon encountering a bad line (a line with too many fields).
+    Allowed values are:
+
+        - ``None``, default option, defer to ``error_bad_lines`` and ``warn_bad_lines``.
+
+          Note: This option is only present for backwards-compatibility reasons and will
+          be removed after the removal of ``error_bad_lines`` and ``warn_bad_lines``.
+          Please do not specify it explicitly.
+
+        - 'error', raise an Exception when a bad line is encountered.
+        - 'warn', raise a warning when a bad line is encountered and skip that line.
+        - 'skip', skip bad lines without raising or warning when they are encountered.
+
+    This parameter takes precedence over parameters
+    ``error_bad_lines`` and ``warn_bad_lines`` if specified.
+
+    .. versionadded:: 1.3
+
 delim_whitespace : bool, default False
     Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
     used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
@@ -382,6 +411,7 @@
     "memory_map": False,
     "error_bad_lines": True,
     "warn_bad_lines": True,
+    "on_bad_lines": None,
     "float_precision": None,
 }
 
@@ -390,8 +420,8 @@
 _c_unsupported = {"skipfooter"}
 _python_unsupported = {"low_memory", "float_precision"}
 
-_deprecated_defaults: Dict[str, Any] = {}
-_deprecated_args: Set[str] = set()
+_deprecated_defaults: Dict[str, Any] = {"error_bad_lines": True, "warn_bad_lines": True}
+_deprecated_args: Set[str] = {"error_bad_lines", "warn_bad_lines"}
 
 
 def validate_integer(name, val, min_val=0):
@@ -533,6 +563,8 @@ def read_csv(
     # Error Handling
     error_bad_lines=True,
     warn_bad_lines=True,
+    # TODO: disallow and change None to 'error' in on_bad_lines in 2.0
+    on_bad_lines=None,
     # Internal
     delim_whitespace=False,
     low_memory=_c_parser_defaults["low_memory"],
@@ -613,6 +645,8 @@ def read_table(
     # Error Handling
     error_bad_lines=True,
     warn_bad_lines=True,
+    # TODO: disallow and change None to 'error' in on_bad_lines in 2.0
+    on_bad_lines=None,
     encoding_errors: Optional[str] = "strict",
     # Internal
     delim_whitespace=False,
@@ -924,7 +958,7 @@ def _clean_options(self, options, engine):
                     f"The {arg} argument has been deprecated and will be "
                     "removed in a future version.\n\n"
                 )
-                warnings.warn(msg, FutureWarning, stacklevel=2)
+                warnings.warn(msg, FutureWarning, stacklevel=6)
             else:
                 result[arg] = parser_default