ENH: Add options to customize extractText() (#334)

These changes allow for an optional text separator for the TJ and Tj operators. These source alterations were originally suggested by DSM on StackOverflow: http://stackoverflow.com/questions/11017379/pypdf-ignores-newlines-in-pdf-file. I'm just passing along the good suggestion in the hope that the change may become standard in some future version.
py-pdf · Apr 7, 2022 · 12c7047 · 12c7047
1 parent ba57659
commit 12c7047
Showing 1 changed file with 3 additions and 2 deletions.
diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
@@ -2656,7 +2656,7 @@ def compressContentStreams(self):
                 content = ContentStream(content, self.pdf)
             self[NameObject("/Contents")] = content.flateEncode()
 
-    def extractText(self):
+    def extractText(self, Tj_sep="", TJ_sep=" "):
         """
         Locate all text drawing commands, in the order they are provided in the
         content stream, and extract the text.  This works well for some PDF
@@ -2678,6 +2678,7 @@ def extractText(self):
             if operator == b_("Tj"):
                 _text = operands[0]
                 if isinstance(_text, TextStringObject):
+                    text += Tj_sep
                     text += _text
                     text += "\n"
             elif operator == b_("T*"):
@@ -2695,7 +2696,7 @@ def extractText(self):
             elif operator == b_("TJ"):
                 for i in operands[0]:
                     if isinstance(i, TextStringObject):
-                        text += " "
+                        text += TJ_sep
                         text += i
                 text += "\n"
         return text