Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add parsedatetime op #5417

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
68 changes: 68 additions & 0 deletions onnx/backend/test/case/node/parsedatetime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright (c) ONNX Project Contributors
#
# SPDX-License-Identifier: Apache-2.0
from datetime import datetime

import numpy as np

import onnx
from onnx import numpy_helper

Check notice

Code scanning / CodeQL

Unused import Note

Import of 'numpy_helper' is not used.
from onnx.backend.test.case.base import Base
from onnx.backend.test.case.node import expect


class ParseDateTime(Base):
    """Backend node test cases for the ``ParseDateTime`` operator.

    Each ``export_*`` method builds a ``ParseDateTime`` node, computes the
    expected output with ``datetime.strptime`` and registers the case through
    ``expect``. Every case uses its own ``name`` so that the cases do not
    collide with each other.
    """

    # NOTE(review): a reviewer asked whether cases with a 2D input and with an
    # input that has a zero-sized dimension should be added as well - TODO.
    # NOTE(review): ``datetime.timestamp()`` on a naive datetime is documented
    # to interpret it as local time, so the expected values below presumably
    # depend on the time zone of the machine generating them - confirm.

    @staticmethod
    def export_float_nan_default() -> None:
        """Case with a double NaN ``default``: the output is ``float64``."""
        fmt = "%d/%m/%y %H:%M"
        default = float("NaN")
        node = onnx.helper.make_node(
            "ParseDateTime",
            inputs=["x"],
            outputs=["y"],
            format=fmt,
            unit="s",
            default=onnx.helper.make_tensor(
                name="default",
                data_type=onnx.TensorProto.DOUBLE,
                dims=[],
                # A rank-0 tensor still carries exactly one value; pass it as
                # a sized sequence rather than an unsized 0-d array.
                vals=[default],
            ),
        )
        x = np.array(["21/11/06 16:30", "foobar"], dtype=object)
        y = []
        for s in x:
            try:
                # datetime.timestamp() returns a float
                y.append(datetime.strptime(s, fmt).timestamp())
            except ValueError:
                # "foobar" does not match the format: fall back to the default.
                y.append(default)
        expect(
            node,
            inputs=[x],
            outputs=[np.array(y, dtype=np.float64)],
            name="test_parsedatetime_float_nan_default",
        )

    @staticmethod
    def export_int_default() -> None:
        """Case with an int64 ``default``: the output is ``int64``."""
        fmt = "%d/%m/%y %H:%M"
        default = np.iinfo(np.int64).min
        node = onnx.helper.make_node(
            "ParseDateTime",
            inputs=["x"],
            outputs=["y"],
            format=fmt,
            unit="s",
            default=onnx.helper.make_tensor(
                name="default",
                data_type=onnx.TensorProto.INT64,
                dims=[],
                # One value for the rank-0 tensor, passed as a sized sequence.
                vals=[default],
            ),
        )
        x = np.array(["21/11/06 16:30", "foobar"], dtype=object)
        y = []
        for s in x:
            try:
                y.append(datetime.strptime(s, fmt).timestamp())
            except ValueError:
                # "foobar" does not match the format: fall back to the default.
                y.append(default)
        expect(
            node,
            inputs=[x],
            outputs=[np.array(y, np.int64)],
            name="test_parsedatetime_int_default",
        )
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
221/11/06 16:302foobarBx
Binary file not shown.
10 changes: 6 additions & 4 deletions onnx/defs/operator_sets.h
Original file line number Diff line number Diff line change
Expand Up @@ -1102,18 +1102,20 @@ class OpSet_Onnx_ver19 {
};

// Forward declarations for ai.onnx version 20
// One declaration per operator, in alphabetical order. The rendered diff
// listed GridSample and Gelu twice (removed and re-added lines shown together).
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, ConstantOfShape);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, Gelu);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, GridSample);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, ParseDateTime);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, StringConcat);

// Iterate over schema from ai.onnx version 20
class OpSet_Onnx_ver20 {
 public:
  // Calls `fn` exactly once with the schema of every operator declared for
  // ai.onnx version 20, in alphabetical order. Each operator must appear only
  // once here; the rendered diff passed GridSample and Gelu to `fn` twice.
  static void ForEachSchema(std::function<void(OpSchema&&)> fn) {
    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, ConstantOfShape)>());
    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, Gelu)>());
    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, GridSample)>());
    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, ParseDateTime)>());
    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, StringConcat)>());
  }
};
Expand Down
38 changes: 38 additions & 0 deletions onnx/defs/text/defs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,44 @@
#include "onnx/defs/schema.h"

namespace ONNX_NAMESPACE {
// Operator documentation. The result is not necessarily floating point: the
// output element type follows the 'default' attribute (see the inference
// function below).
static const char* ParseDateTime_doc =
    R"DOC(Parse datetime strings into Unix time stamps expressed in the requested unit. The output element type is the type of the 'default' attribute, or `tensor(int64)` if 'default' is not specified.)DOC";
ONNX_OPERATOR_SET_SCHEMA(
    ParseDateTime,
    20,
    OpSchema()
        .Input(0, "X", "Tensor with datetime strings", "T1", OpSchema::Single, true, 1, OpSchema::NonDifferentiable)
        .Output(0, "y", "Unix time stamps", "T2", OpSchema::Single, true, 1, OpSchema::NonDifferentiable)
        .Attr("format", "Format description in the syntax of C's `strptime`.", AttributeProto::STRING)
        // NOTE(review): a reviewer suggested making 'unit' optional as well,
        // with one of the units chosen as its default value.
        .Attr(
            "unit",
            "Unit of the returned time stamp. Allowed values are: 's' (second), 'ms' (millisecond), 'us' (microsecond) or 'ns' (nanosecond).",
            AttributeProto::STRING)
        // NOTE(review): a reviewer asked whether some (unit, default)
        // combinations are impossible, and noted that the text documents the
        // type of the default value but not the value itself.
        .Attr(
            "default",
            "Default value to be used if the parsing fails. The tensor must be of rank 0 and either of type `tensor(int64)` or `tensor(double)`. The tensor type is the output type. If 'default' is not specified, the output type is `tensor(int64)` and the behavior for failing to parse an input element is implementation defined.",
            AttributeProto::TENSOR,
            OPTIONAL_VALUE)
        .TypeConstraint("T1", {"tensor(string)"}, "UTF-8 datetime strings")
        // NOTE(review): a reviewer questioned whether `tensor(double)` needs
        // to be supported, since a NaN fallback can be built from the int64
        // result with Where(Equal(value, default), NaN, Cast(value)).
        .TypeConstraint("T2", {"tensor(double)", "tensor(int64)"}, "Output type depends on 'default' attribute.")
        .SetDoc(ParseDateTime_doc)
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          // Without 'default' the output is int64; otherwise the output takes
          // the element type of the 'default' tensor.
          int output_elem_type = TensorProto::INT64;

          const auto* default_value = ctx.getAttribute("default");
          if (nullptr != default_value) {
            if (!default_value->has_t()) {
              fail_type_inference("Attribute 'default' must be a tensor.");
            }
            const TensorProto& tensor_proto = default_value->t();
            // Enforce what the attribute documentation promises.
            if (tensor_proto.dims_size() != 0) {
              fail_shape_inference("Attribute 'default' must be a tensor of rank 0.");
            }
            output_elem_type = tensor_proto.data_type();
            if (output_elem_type != TensorProto::INT64 && output_elem_type != TensorProto::DOUBLE) {
              fail_type_inference("Attribute 'default' must be of type tensor(int64) or tensor(double).");
            }
          }

          // Set the element type first so that the output is already a tensor
          // type when its shape is filled in.
          updateOutputElemType(ctx, 0, output_elem_type);

          // The operator is elementwise: the output has the input's shape.
          if (hasInputShape(ctx, 0)) {
            propagateShapeFromInputToOutput(ctx, 0, 0);
          }
        }));

static const char* StringConcat_doc =
R"DOC(StringConcat concatenates string tensors elementwise (with NumPy-style broadcasting support))DOC";
ONNX_OPERATOR_SET_SCHEMA(
Expand Down
1 change: 1 addition & 0 deletions onnx/reference/ops/_op_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@
from onnx.reference.ops.op_optional_has_element import OptionalHasElement
from onnx.reference.ops.op_or import Or
from onnx.reference.ops.op_pad import Pad_1, Pad_2, Pad_11, Pad_18
from onnx.reference.ops.op_parsedatetime import ParseDateTime

Check notice

Code scanning / CodeQL

Unused import Note

Import of 'ParseDateTime' is not used.
from onnx.reference.ops.op_pow import Pow
from onnx.reference.ops.op_prelu import PRelu
from onnx.reference.ops.op_qlinear_conv import QLinearConv
Expand Down
22 changes: 22 additions & 0 deletions onnx/reference/ops/op_parsedatetime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright (c) ONNX Project Contributors

# SPDX-License-Identifier: Apache-2.0
# pylint: disable=W0221
from datetime import datetime

import numpy as np

from onnx.reference.op_run import OpRun


class ParseDateTime(OpRun):
    """Reference implementation of the ``ParseDateTime`` operator."""

    def _run(self, x, format=None, unit=None, default=None):  # type: ignore
        """Parse every datetime string of ``x`` into a Unix time stamp.

        ``format``, ``unit`` and ``default`` are operator attributes; they are
        declared with a ``None`` default to tell them apart from the input.

        Args:
            x: Array of datetime strings, of any shape.
            format: Format description in ``strptime`` syntax.
            unit: Unit of the result: "s", "ms", "us" or "ns".
            default: Rank-0 array used for elements that fail to parse; its
                dtype is the dtype of the output. When ``None`` the output is
                int64 and failed elements are set to the smallest int64.

        Returns:
            A one-element tuple holding the time stamps, shaped like ``x``.

        Raises:
            ValueError: If ``unit`` is not one of the supported units.
        """
        ticks_per_second = {"s": 1, "ms": 10**3, "us": 10**6, "ns": 10**9}
        if unit not in ticks_per_second:
            raise ValueError(
                f"Unsupported unit {unit!r}, expected one of "
                f"{sorted(ticks_per_second)}."
            )
        scale = ticks_per_second[unit]

        def parse(el):
            try:
                # NOTE(review): strptime yields a naive datetime here, which
                # timestamp() is documented to interpret as local time -
                # confirm this is the intended semantics.
                return datetime.strptime(el, format).timestamp() * scale
            except ValueError:
                # NaN marks the elements that could not be parsed.
                return np.nan

        # Flatten and restore the shape so inputs of any rank (including
        # empty ones) are handled elementwise.
        strings = np.asarray(x)
        stamps = np.array(
            [parse(el) for el in strings.ravel()], dtype=np.float64
        ).reshape(strings.shape)
        failed = np.isnan(stamps)

        if default is None:
            # The behavior is implementation defined in this case; use a
            # fixed value instead of relying on a NaN -> int64 cast.
            fallback = np.array(np.iinfo(np.int64).min, dtype=np.int64)
        else:
            fallback = np.asarray(default)

        # Clear the NaN markers before the cast, then write the fallback in
        # the output dtype so large int64 defaults are not rounded via float.
        stamps[failed] = 0
        out = stamps.astype(fallback.dtype)
        out[failed] = fallback
        return (out,)