TST: Text extraction for non-latin alphabets (#954)

See #591
py-pdf · Jun 6, 2022 · babe32e · babe32e
1 parent 2a1db78
commit babe32e
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 0 deletions.
diff --git a/resources/hello-world.pdf b/resources/hello-world.pdf
diff --git a/tests/test_reader.py b/tests/test_reader.py
@@ -671,3 +671,23 @@ def test_iss925():
         if annots is not None:
             for annot in annots:
                 annot.get_object()
+
+
+@pytest.mark.xfail(reason="#591")  # expected to fail for now; the reason string points at tracker entry #591
+def test_extract_text_hello_world():  # compares text extracted from hello-world.pdf with known greetings in several scripts
+    reader = PdfReader(os.path.join(RESOURCE_ROOT, "hello-world.pdf"))  # sample PDF is looked up under RESOURCE_ROOT
+    text = reader.pages[0].extract_text().split("\n")  # first page only, split into one entry per extracted line
+    assert text == [  # expected lines alternate: a language label, then the greeting in that language
+        "English:",
+        "Hello World",
+        "Arabic:",
+        "مرحبا بالعالم",
+        "Russian:",
+        "Привет, мир",
+        "Chinese (traditional):",
+        "你好世界",
+        "Thai:",
+        "สวัสดีชาวโลก",
+        "Japanese:",
+        "こんにちは世界",
+    ]