Skip to content

Commit

Permalink
TST: Text extraction for non-latin alphabets (#954)
Browse files Browse the repository at this point in the history
See #591
  • Loading branch information
MartinThoma committed Jun 6, 2022
1 parent 2a1db78 commit babe32e
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 0 deletions.
Binary file added resources/hello-world.pdf
Binary file not shown.
20 changes: 20 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,3 +671,23 @@ def test_iss925():
if annots is not None:
for annot in annots:
annot.get_object()


@pytest.mark.xfail(reason="#591")
def test_extract_text_hello_world():
    """Compare extracted text of ``hello-world.pdf`` with multi-script lines.

    The first page's text is split on newlines and must equal a fixed list
    that alternates a language label with a greeting in that language
    (English, Arabic, Russian, traditional Chinese, Thai, Japanese).

    The test is marked ``xfail`` with issue #591 as the reason.
    """
    expected_lines = [
        "English:",
        "Hello World",
        "Arabic:",
        "مرحبا بالعالم",
        "Russian:",
        "Привет, мир",
        "Chinese (traditional):",
        "你好世界",
        "Thai:",
        "สวัสดีชาวโลก",
        "Japanese:",
        "こんにちは世界",
    ]

    pdf_path = os.path.join(RESOURCE_ROOT, "hello-world.pdf")
    first_page = PdfReader(pdf_path).pages[0]

    # Split on "\n" only, so the comparison is strictly line-by-line.
    extracted_lines = first_page.extract_text().split("\n")

    assert extracted_lines == expected_lines

0 comments on commit babe32e

Please sign in to comment.