Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow Pathlike objects for file arguments #337

Merged
merged 7 commits into from Feb 12, 2022
19 changes: 19 additions & 0 deletions src/lxml/apihelpers.pxi
Expand Up @@ -1582,6 +1582,25 @@ cdef bint _isFilePath(const_xmlChar* c_path):
# assume it's a relative path
return REL_FILE_PATH

cdef object _getFSPathOrObject(object obj):
    u"""
    Resolve a path-like object to its file system path.

    If the object supports the ``__fspath__`` protocol, the result of
    that protocol is returned.  Any other object is handed back
    unchanged so that callers can continue with their own type checks.
    """
    if python.PY_VERSION_HEX >= 0x03060000:
        # Delegate to the C-API helper; a TypeError from it is treated
        # as "this object is not path-like" and the input passes through.
        try:
            return python.PY_FSPath(obj)
        except TypeError:
            return obj

    # Fallback for interpreters without the C-API helper: look up the
    # protocol method by hand.
    if not hasattr(obj, '__fspath__'):
        return obj
    fspath_method = obj.__fspath__
    if not callable(fspath_method):
        raise ValueError('__fspath__ is not callable')
    return fspath_method()
janssenhenning marked this conversation as resolved.
Show resolved Hide resolved

cdef object _encodeFilename(object filename):
u"""Make sure a filename is 8-bit encoded (or None).
"""
Expand Down
3 changes: 2 additions & 1 deletion src/lxml/dtd.pxi
Expand Up @@ -279,6 +279,7 @@ cdef class DTD(_Validator):
def __init__(self, file=None, *, external_id=None):
_Validator.__init__(self)
if file is not None:
file = _getFSPathOrObject(file)
if _isString(file):
file = _encodeFilename(file)
with self._error_log:
Expand All @@ -290,7 +291,7 @@ cdef class DTD(_Validator):
self._c_dtd = _parseDtdFromFilelike(file)
_reset_document_loader(orig_loader)
else:
raise DTDParseError, u"file must be a filename or file-like object"
raise DTDParseError, u"file must be a filename, file-like or path-like object"
elif external_id is not None:
with self._error_log:
orig_loader = _register_document_loader()
Expand Down
6 changes: 6 additions & 0 deletions src/lxml/includes/etree_defs.h
Expand Up @@ -247,6 +247,12 @@ long _ftol2( double dblSource ) { return _ftol( dblSource ); }
#define _isString(obj) (PyUnicode_Check(obj) || PyBytes_Check(obj))
#endif

/* PY_FSPath(obj): forwards to PyOS_FSPath() when building against
 * Python 3.6 or later.  On older versions the macro only expands to a
 * NULL placeholder, so it must not be relied on there. */
#if PY_VERSION_HEX >= 0x03060000
#define PY_FSPath(obj) (PyOS_FSPath(obj))
#else
#define PY_FSPath(obj) (NULL)
#endif
janssenhenning marked this conversation as resolved.
Show resolved Hide resolved

#define _isElement(c_node) \
(((c_node)->type == XML_ELEMENT_NODE) || \
((c_node)->type == XML_COMMENT_NODE) || \
Expand Down
1 change: 1 addition & 0 deletions src/lxml/iterparse.pxi
Expand Up @@ -72,6 +72,7 @@ cdef class iterparse:
html=False, recover=None, huge_tree=False, collect_ids=True,
XMLSchema schema=None):
if not hasattr(source, 'read'):
source = _getFSPathOrObject(source)
self._filename = source
if python.IS_PYTHON2:
source = _encodeFilename(source)
Expand Down
1 change: 1 addition & 0 deletions src/lxml/parser.pxi
Expand Up @@ -1870,6 +1870,7 @@ cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:

cdef _Document _parseDocument(source, _BaseParser parser, base_url):
cdef _Document doc
source = _getFSPathOrObject(source)
if _isString(source):
# parse the file directly from the filesystem
doc = _parseDocumentFromURL(_encodeFilename(source), parser)
Expand Down
1 change: 1 addition & 0 deletions src/lxml/python.pxd
Expand Up @@ -127,6 +127,7 @@ cdef extern from "includes/etree_defs.h": # redefines some functions as macros
cdef bint IS_PYTHON2
cdef bint IS_PYTHON3 # legacy, avoid
cdef bint IS_PYPY
cdef object PY_FSPath(object obj)
janssenhenning marked this conversation as resolved.
Show resolved Hide resolved

cdef extern from "lxml_endian.h":
cdef bint PY_BIG_ENDIAN # defined in later Py3.x versions
4 changes: 4 additions & 0 deletions src/lxml/serializer.pxi
Expand Up @@ -627,6 +627,7 @@ cdef object _open_utf8_file

@contextmanager
def _open_utf8_file(file, compression=0):
file = _getFSPathOrObject(file)
if _isString(file):
if compression:
with gzip.GzipFile(file, mode='wb', compresslevel=compression) as zf:
Expand Down Expand Up @@ -723,6 +724,7 @@ cdef _tofilelike(f, _Element element, encoding, doctype, method,
with GzipFile(fileobj=bytes_out, mode='wb', compresslevel=compression) as gzip_file:
gzip_file.write(data)
data = bytes_out.getvalue()
f = _getFSPathOrObject(f)
if _isString(f):
filename8 = _encodeFilename(f)
with open(filename8, 'wb') as f:
Expand Down Expand Up @@ -787,6 +789,7 @@ cdef _FilelikeWriter _create_output_buffer(
raise LookupError(
f"unknown encoding: '{c_enc.decode('UTF-8') if c_enc is not NULL else u''}'")
try:
f = _getFSPathOrObject(f)
if _isString(f):
filename8 = _encodeFilename(f)
if b'%' in filename8 and (
Expand Down Expand Up @@ -852,6 +855,7 @@ cdef _tofilelikeC14N(f, _Element element, bint exclusive, bint with_comments,
_convert_ns_prefixes(c_doc.dict, inclusive_ns_prefixes)
if inclusive_ns_prefixes else NULL)

f = _getFSPathOrObject(f)
if _isString(f):
filename8 = _encodeFilename(f)
c_filename = _cstr(filename8)
Expand Down
14 changes: 14 additions & 0 deletions src/lxml/tests/test_dtd.py
Expand Up @@ -24,6 +24,20 @@ def test_dtd_file(self):

dtd = etree.DTD(fileInTestDir("test.dtd"))
self.assertTrue(dtd.validate(root))

def test_dtd_file_pathlike(self):
parse = etree.parse
tree = parse(fileInTestDir("test.xml"))
root = tree.getroot()

class Path(object):
def __init__(self,path):
self.path = path
def __fspath__(self):
return self.path
janssenhenning marked this conversation as resolved.
Show resolved Hide resolved

dtd = etree.DTD(Path(fileInTestDir("test.dtd")))
self.assertTrue(dtd.validate(root))

def test_dtd_stringio(self):
root = etree.XML(_bytes("<b/>"))
Expand Down
87 changes: 87 additions & 0 deletions src/lxml/tests/test_etree.py
Expand Up @@ -4599,6 +4599,34 @@ def test_proxy_collect_siblings_text(self):
self.assertEqual('child1', c2.getprevious().tag)
self.assertEqual('abc', c2.getprevious().tail)

def test_parse_source_pathlike(self):
    # parse() is given an object that only exposes its location
    # through the __fspath__ protocol.
    class Path(object):
        def __init__(self, location):
            self._location = location

        def __fspath__(self):
            return self._location

    etree = self.etree
    serialize = etree.tounicode

    source = Path(fileInTestDir('test.xml'))
    document = etree.parse(source)
    expected = _bytes('<a><b></b></a>')
    self.assertEqual(expected, canonicalize(serialize(document)))

def test_iterparse_source_pathlike(self):
    # iterparse() is given an object that only exposes its location
    # through the __fspath__ protocol.
    class Path(object):
        def __init__(self, location):
            self._location = location

        def __fspath__(self):
            return self._location

    source = Path(fileInTestDir('test.xml'))
    parsed_events = list(self.etree.iterparse(source))
    self.assertEqual(2, len(parsed_events))

# helper methods

def _writeElement(self, element, encoding='us-ascii', compression=0):
Expand Down Expand Up @@ -4883,6 +4911,20 @@ def test_c14n_file(self):
data = read_file(filename, 'rb')
self.assertEqual(_bytes('<a><b></b></a>'),
data)

def test_c14n_file_pathlike(self):
    # write_c14n() is given a path-like target instead of a plain
    # file name string.
    class Path(object):
        def __init__(self, location):
            self._location = location

        def __fspath__(self):
            return self._location

    document = self.parse(_bytes('<a><b/></a>'))
    with tmpfile() as filename:
        target = Path(filename)
        document.write_c14n(target)
        # read the result back through the plain file name
        written = read_file(filename, 'rb')
    expected = _bytes('<a><b></b></a>')
    self.assertEqual(expected, written)

def test_c14n_file_gzip(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
Expand All @@ -4892,6 +4934,21 @@ def test_c14n_file_gzip(self):
data = f.read()
self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'),
data)

def test_c14n_file_gzip_pathlike(self):
    # write_c14n() with compression is given a path-like target; the
    # output is read back through gzip.
    class Path(object):
        def __init__(self, location):
            self._location = location

        def __fspath__(self):
            return self._location

    document = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
    with tmpfile() as filename:
        target = Path(filename)
        document.write_c14n(target, compression=9)
        with gzip.open(filename, 'rb') as compressed:
            written = compressed.read()
    expected = _bytes('<a>'+'<b></b>'*200+'</a>')
    self.assertEqual(expected, written)

def test_c14n2_file_gzip(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
Expand Down Expand Up @@ -5182,6 +5239,20 @@ def test_write_file(self):
data = read_file(filename, 'rb')
self.assertEqual(_bytes('<a><b/></a>'),
data)

def test_write_file_pathlike(self):
    # write() is given a path-like target instead of a plain file
    # name string.
    class Path(object):
        def __init__(self, location):
            self._location = location

        def __fspath__(self):
            return self._location

    document = self.parse(_bytes('<a><b/></a>'))
    with tmpfile() as filename:
        target = Path(filename)
        document.write(target)
        # read the result back through the plain file name
        written = read_file(filename, 'rb')
    expected = _bytes('<a><b/></a>')
    self.assertEqual(expected, written)

def test_write_file_gzip(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
Expand All @@ -5192,6 +5263,22 @@ def test_write_file_gzip(self):
self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
data)

def test_write_file_gzip_pathlike(self):
    # write() with compression is given a path-like target; the
    # output is read back through gzip.
    class Path(object):
        def __init__(self, location):
            self._location = location

        def __fspath__(self):
            return self._location

    document = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
    with tmpfile() as filename:
        target = Path(filename)
        document.write(target, compression=9)
        with gzip.open(filename, 'rb') as compressed:
            written = compressed.read()
    expected = _bytes('<a>'+'<b/>'*200+'</a>')
    self.assertEqual(expected, written)

def test_write_file_gzip_parse(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
with tmpfile() as filename:
Expand Down
12 changes: 12 additions & 0 deletions src/lxml/tests/test_xmlschema.py
Expand Up @@ -387,6 +387,18 @@ def test_create_from_partial_doc(self):
etree.XMLSchema(schema_element)
etree.XMLSchema(schema_element)

def test_xmlschema_pathlike(self):
    # XMLSchema(file=...) is given an object that only exposes its
    # location through the __fspath__ protocol.
    class Path(object):
        def __init__(self, location):
            self._location = location

        def __fspath__(self):
            return self._location

    schema_source = Path(fileInTestDir('test.xsd'))
    schema = etree.XMLSchema(file=schema_source)
    valid_document = self.parse('<a><b></b></a>')
    is_valid = schema.validate(valid_document)
    self.assertTrue(is_valid)


class ETreeXMLSchemaResolversTestCase(HelperTestCase):
resolver_schema_int = BytesIO("""\
Expand Down
19 changes: 19 additions & 0 deletions src/lxml/tests/test_xslt.py
Expand Up @@ -195,6 +195,25 @@ def test_xslt_write_output_file_path(self):
res[0] = f.read().decode("UTF-16")
finally:
os.unlink(f.name)

def test_xslt_write_output_file_pathlike(self):
    # write_output() with compression is given a path-like target; the
    # output is read back through gzip and stored in the setup's
    # result slot.
    class Path(object):
        def __init__(self, location):
            self._location = location

        def __fspath__(self):
            return self._location

    with self._xslt_setup() as res:
        tmp = NamedTemporaryFile(delete=False)
        tmp_name = tmp.name
        try:
            try:
                res[0].write_output(Path(tmp_name), compression=9)
            finally:
                tmp.close()
            with gzip.GzipFile(tmp_name) as compressed:
                res[0] = compressed.read().decode("UTF-16")
        finally:
            # delete=False above, so the file has to be removed by hand
            os.unlink(tmp_name)

def test_xslt_write_output_file_path_urlescaped(self):
# libxml2 should not unescape file paths.
Expand Down
1 change: 1 addition & 0 deletions src/lxml/xmlschema.pxi
Expand Up @@ -56,6 +56,7 @@ cdef class XMLSchema(_Validator):
self._doc = _documentFactory(c_doc, doc._parser)
parser_ctxt = xmlschema.xmlSchemaNewDocParserCtxt(c_doc)
elif file is not None:
file = _getFSPathOrObject(file)
if _isString(file):
filename = _encodeFilename(file)
parser_ctxt = xmlschema.xmlSchemaNewParserCtxt(_cstr(filename))
Expand Down