Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

ENH: pd.read_json ability to skip over the first line when lines=True (#58049) #58412

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 11 additions & 0 deletions pandas/io/json/_json.py
Expand Up @@ -511,6 +511,7 @@ def read_json(
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
engine: JSONEngine = "ujson",
skiprows: int | list[int] | Callable[[int], bool] = None,
) -> DataFrame | Series | JsonReader:
"""
Convert a JSON string to pandas object.
Expand Down Expand Up @@ -829,6 +830,7 @@ def __init__(
encoding_errors: str | None = "strict",
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
engine: JSONEngine = "ujson",
skiprows: int | list[int] | Callable[[int], bool] = None,
) -> None:
self.orient = orient
self.typ = typ
Expand All @@ -849,6 +851,7 @@ def __init__(
self.encoding_errors = encoding_errors
self.handles: IOHandles[str] | None = None
self.dtype_backend = dtype_backend
self.skiprows = skiprows

if self.engine not in {"pyarrow", "ujson"}:
raise ValueError(
Expand Down Expand Up @@ -1021,11 +1024,19 @@ def __next__(self) -> DataFrame | Series:
self.close()
raise StopIteration

if isinstance(self.skiprows, int):
for _ in range(self.skiprows):
next(self.data)
self.skiprows = None

lines = list(islice(self.data, self.chunksize))
if not lines:
self.close()
raise StopIteration

if callable(self.skiprows):
lines = [line for i, line in enumerate(lines) if not self.skiprows(i)]

try:
lines_json = self._combine_lines(lines)
obj = self._get_object_parser(lines_json)
Expand Down