Skip to content

Commit

Permalink
Updated extractText() according to changes proposed in issue py-pdf#17
Browse files: browse the repository at this point in the history
  • Loading branch information
Tom-Evers committed Mar 4, 2018
1 parent a4279cf commit 9217428
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions PyPDF2/pdf.py
Expand Up @@ -2661,12 +2661,14 @@ def extractText(self):
# Note: we check all strings are TextStringObjects. ByteStringObjects
# are strings where the byte->string encoding was unknown, so adding
# them to the text here would be gibberish.
#
for operands, operator in content.operations:
if not operands: # Empty operands list contributes no text
operands = [""]
if operator == b_("Tj"):
_text = operands[0]
if isinstance(_text, TextStringObject):
text += _text
text += "\n"
elif operator == b_("T*"):
text += "\n"
elif operator == b_("'"):
Expand All @@ -2683,7 +2685,13 @@ def extractText(self):
for i in operands[0]:
if isinstance(i, TextStringObject):
text += i
text += "\n"
elif isinstance(i, FloatObject) or isinstance(i, NumberObject):
if text and (not text[-1] in " \n"):
text += " " * int(i / -600)
text += "\n"
elif operator == b_("TD") or operator == b_("Tm"):
if text and (not text[-1] in " \n"):
text += " "
return text

mediaBox = createRectangleAccessor("/MediaBox", ())
Expand Down

0 comments on commit 9217428

Please sign in to comment.