Update model addition

Add support for PDF source files
vnkrtv · Mar 29, 2021 · d0bf276 · d0bf276
1 parent fd0b692
commit d0bf276
Show file tree

Hide file tree

Showing 4 changed files with 35 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@ markov/models/*.json
 migrations/
 instance
 notebooks/.ipynb_checkpoints
+notebooks/training_data
 notebooks/NGrammMarkov2.0.ipynb
 notebooks/mnogogramm.ipynb
 __pycache__

diff --git a/app/utils.py b/app/utils.py
@@ -3,6 +3,7 @@
 from typing import Generator, Iterable, Optional, List, Tuple, Dict, Any
 
 import textract
+import pdftotext
 from textract.exceptions import ExtensionNotSupported
 from werkzeug.datastructures import FileStorage
 from werkzeug.utils import secure_filename
@@ -48,10 +49,13 @@ def get_text_corpus_from_postgres(request_dict: Dict[str, Any]) -> Iterable[str]
 
 def get_text_corpus_from_file(request, filename: str) -> Iterable[str]:
     file = request.files[filename]
+    if file.filename.endswith('.pdf'):
+        pdf = pdftotext.PDF(file)
+        return (page.replace('\n', ' ') for page in pdf)
     filepath = os.path.join(app.instance_path, Config.TMP_DATA_FOLDER, file.filename)
     file.save(filepath)
     content = textract.process(filepath)
     os.remove(filepath)
-    return (text.replace('\n', '') for text in content.decode().split('\n\n'))
+    return (text.replace('\n', ' ') for text in content.decode().split('\n\n'))
 
 # def get_text_corpus_from_folder()
diff --git a/notebooks/textextractor.ipynb b/notebooks/textextractor.ipynb
@@ -172,7 +172,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
@@ -182,7 +182,7 @@
      "traceback": [
       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
       "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-21-fe4f050d94b0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m     \u001b[0;31m#extract text from the file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m     \u001b[0mcontent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtextract\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_directory\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprocess_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     12\u001b[0m     \u001b[0;31m# We create and open the new and we prepare to write the Binary Data which is represented by the wb - Write Binary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m<ipython-input-25-8efe96720a60>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m     \u001b[0;31m#extract text from the file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m     \u001b[0mcontent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtextract\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPDF_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     12\u001b[0m     \u001b[0;31m# We create and open the new and we prepare to write the Binary Data which is represented by the wb - Write Binary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
       "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/textract/parsers/__init__.py\u001b[0m in \u001b[0;36mprocess\u001b[0;34m(filename, encoding, extension, **kwargs)\u001b[0m\n\u001b[1;32m     75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     76\u001b[0m     \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfiletype_module\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mParser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 77\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     79\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
       "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/textract/parsers/utils.py\u001b[0m in \u001b[0;36mprocess\u001b[0;34m(self, filename, encoding, **kwargs)\u001b[0m\n\u001b[1;32m     45\u001b[0m         \u001b[0;31m# http://nedbatchelder.com/text/unipain/unipain.html#35\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     46\u001b[0m         \u001b[0mbyte_string\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m         \u001b[0municode_string\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbyte_string\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     48\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0municode_string\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
       "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/textract/parsers/utils.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m     63\u001b[0m         \u001b[0;31m# use chardet to automatically detect the encoding text\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     64\u001b[0m         \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mchardet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdetect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 65\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'encoding'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     67\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",

diff --git a/requirements.txt b/requirements.txt
@@ -1,30 +1,56 @@
 alembic==1.4.3
+argcomplete==1.10.0
+beautifulsoup4==4.8.0
 certifi==2020.12.5
+cffi==1.14.5
+chardet==3.0.4
 click==7.1.2
+cryptography==3.4.7
+docx2txt==0.8
+EbookLib==0.17.1
 elasticsearch==7.11.0
+extract-msg==0.23.1
 Flask==1.1.2
 Flask-Login==0.5.0
 Flask-Migrate==2.5.3
 Flask-SQLAlchemy==2.4.4
 Flask-WTF==0.14.3
 gunicorn==20.0.4
+IMAPClient==2.1.0
 itsdangerous==1.1.0
 Jinja2==2.11.2
 joblib==0.16.0
+lxml==4.6.3
 Mako==1.1.3
 MarkupSafe==1.1.1
 nltk==3.5
 numpy==1.19.5
+olefile==0.46
+pdfminer.six==20181108
+pdftotext==2.1.5
+Pillow==8.1.2
+pluginbase==1.0.0
 psycopg2-binary==2.8.6
+pycparser==2.20
+pycryptodome==3.10.1
 pymongo==3.11.0
 python-dateutil==2.8.1
 python-editor==1.0.4
+python-pptx==0.6.18
+pytz==2021.1
 razdel==0.5.0
 regex==2020.9.27
-six==1.15.0
+six==1.12.0
+sortedcontainers==2.3.0
+soupsieve==2.2.1
+SpeechRecognition==3.8.1
 SQLAlchemy==1.3.19
+textract==1.6.3
 tqdm==4.50.0
+tzlocal==1.5.1
 Unidecode==1.1.1
 urllib3==1.22
 Werkzeug==1.0.1
 WTForms==2.3.3
+xlrd==1.2.0
+XlsxWriter==1.3.7