Skip to content

Commit

Permalink
Update models adding
Browse files Browse the repository at this point in the history
Add pdf source files support
  • Loading branch information
vnkrtv committed Mar 29, 2021
1 parent fd0b692 commit d0bf276
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -8,6 +8,7 @@ markov/models/*.json
migrations/
instance
notebooks/.ipynb_checkpoints
notebooks/training_data
notebooks/NGrammMarkov2.0.ipynb
notebooks/mnogogramm.ipynb
__pycache__
Expand Down
6 changes: 5 additions & 1 deletion app/utils.py
Expand Up @@ -3,6 +3,7 @@
from typing import Generator, Iterable, Optional, List, Tuple, Dict, Any

import textract
import pdftotext
from textract.exceptions import ExtensionNotSupported
from werkzeug.datastructures import FileStorage
from werkzeug.utils import secure_filename
Expand Down Expand Up @@ -48,10 +49,13 @@ def get_text_corpus_from_postgres(request_dict: Dict[str, Any]) -> Iterable[str]

def get_text_corpus_from_file(request, filename: str) -> Iterable[str]:
file = request.files[filename]
if file.filename.endswith('.pdf'):
pdf = pdftotext.PDF(file)
return (page.replace('\n', ' ') for page in pdf)
filepath = os.path.join(app.instance_path, Config.TMP_DATA_FOLDER, file.filename)
file.save(filepath)
content = textract.process(filepath)
os.remove(filepath)
return (text.replace('\n', '') for text in content.decode().split('\n\n'))
return (text.replace('\n', ' ') for text in content.decode().split('\n\n'))

# def get_text_corpus_from_folder()
4 changes: 2 additions & 2 deletions notebooks/textextractor.ipynb
Expand Up @@ -172,7 +172,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 25,
"metadata": {},
"outputs": [
{
Expand All @@ -182,7 +182,7 @@
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-21-fe4f050d94b0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m#extract text from the file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mcontent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtextract\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_directory\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprocess_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;31m# We create and open the new and we prepare to write the Binary Data which is represented by the wb - Write Binary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-25-8efe96720a60>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m#extract text from the file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mcontent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtextract\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPDF_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;31m# We create and open the new and we prepare to write the Binary Data which is represented by the wb - Write Binary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/textract/parsers/__init__.py\u001b[0m in \u001b[0;36mprocess\u001b[0;34m(filename, encoding, extension, **kwargs)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfiletype_module\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mParser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 77\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/textract/parsers/utils.py\u001b[0m in \u001b[0;36mprocess\u001b[0;34m(self, filename, encoding, **kwargs)\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0;31m# http://nedbatchelder.com/text/unipain/unipain.html#35\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0mbyte_string\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0municode_string\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbyte_string\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 48\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0municode_string\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/textract/parsers/utils.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[0;31m# use chardet to automatically detect the encoding text\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mchardet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdetect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 65\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'encoding'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
Expand Down
28 changes: 27 additions & 1 deletion requirements.txt
@@ -1,30 +1,56 @@
alembic==1.4.3
argcomplete==1.10.0
beautifulsoup4==4.8.0
certifi==2020.12.5
cffi==1.14.5
chardet==3.0.4
click==7.1.2
cryptography==3.4.7
docx2txt==0.8
EbookLib==0.17.1
elasticsearch==7.11.0
extract-msg==0.23.1
Flask==1.1.2
Flask-Login==0.5.0
Flask-Migrate==2.5.3
Flask-SQLAlchemy==2.4.4
Flask-WTF==0.14.3
gunicorn==20.0.4
IMAPClient==2.1.0
itsdangerous==1.1.0
Jinja2==2.11.2
joblib==0.16.0
lxml==4.6.3
Mako==1.1.3
MarkupSafe==1.1.1
nltk==3.5
numpy==1.19.5
olefile==0.46
pdfminer.six==20181108
pdftotext==2.1.5
Pillow==8.1.2
pluginbase==1.0.0
psycopg2-binary==2.8.6
pycparser==2.20
pycryptodome==3.10.1
pymongo==3.11.0
python-dateutil==2.8.1
python-editor==1.0.4
python-pptx==0.6.18
pytz==2021.1
razdel==0.5.0
regex==2020.9.27
six==1.15.0
six==1.12.0
sortedcontainers==2.3.0
soupsieve==2.2.1
SpeechRecognition==3.8.1
SQLAlchemy==1.3.19
textract==1.6.3
tqdm==4.50.0
tzlocal==1.5.1
Unidecode==1.1.1
urllib3==1.22
Werkzeug==1.0.1
WTForms==2.3.3
xlrd==1.2.0
XlsxWriter==1.3.7

0 comments on commit d0bf276

Please sign in to comment.