WIP: Code optimisation
radheyakale committed Jun 29, 2022
2 parents fcc597a + 144a86a commit c479f08
Showing 53 changed files with 602 additions and 355 deletions.
5 changes: 5 additions & 0 deletions .bandit
@@ -0,0 +1,5 @@
[bandit]
; Only test the Gramex source folder, not tests or testlib
exclude = */tests/*,*/testlib/*,*/node_modules/*
; B101:assert_used - assertions are used in test cases and are harmless in code
skips = B101
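For reference, this configuration is meant to be picked up when Bandit scans the Gramex source tree. A minimal, hypothetical sketch of running such a scan from Python — it assumes Bandit is installed and that its `--ini` flag (which loads a `.bandit` INI file) is available in the installed version:

import subprocess  # nosec B404: used only to launch the linter locally

# Scan the gramex/ package with the .bandit settings above:
# tests/, testlib/ and node_modules/ are excluded, and B101 (assert_used) is skipped.
result = subprocess.run(
    ['bandit', '-r', 'gramex', '--ini', '.bandit'],
    capture_output=True, text=True,
)
print(result.stdout)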
14 changes: 1 addition & 13 deletions .editorconfig
@@ -7,23 +7,11 @@ root = true

# Apply common styles for most standard code files.
# Do not apply to * - that covers binary files as well
[*.{js,html,php,py,css,svg,json,less,yaml,yml,scss,xml,sh,java,bat,R}]
[*.{js,html,php,css,svg,json,less,yaml,yml,scss,xml,sh,java,bat,R}]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
charset = utf-8
# Stick to 2-space indenting by default, to conserve space
indent_style = space
indent_size = 2

[*.py]
indent_size = 4

[Makefile]
indent_style = tab
indent_size = 4

[testlib/test_config/config.empty.yaml]
insert_final_newline = false
[tests/dir/gramex.yaml]
insert_final_newline = false
5 changes: 3 additions & 2 deletions gramex/apps/admin/controlpanel.py
@@ -117,10 +117,11 @@ def evaluate(handler, code):
# Run code and get the result. (Result is None for exec)
try:
context = contexts.setdefault(handler.session['id'], {})
# B307:eval B102:exec_used is safe since only admin can run this
if mode == 'eval':
result = eval(co, context) # nosec: only admin can run this
result = eval(co, context) # nosec B307
else:
exec(co, context) # nosec: only admin can run this
exec(co, context) # nosec B102
result = None
except Exception as e:
result = e
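For context, the dispatch above follows the standard compile-then-run pattern: expressions go through eval to get a value, statements go through exec. A self-contained, hypothetical sketch of that pattern (not part of this commit):

def run_snippet(code, context):
    '''Evaluate `code` as an expression if possible, else execute it as statements.'''
    try:
        co, mode = compile(code, '<input>', 'eval'), 'eval'
    except SyntaxError:
        co, mode = compile(code, '<input>', 'exec'), 'exec'
    if mode == 'eval':
        return eval(co, context)  # nosec B307: trusted (admin-only) input
    exec(co, context)  # nosec B102: trusted (admin-only) input
    return None

context = {}
print(run_snippet('x = 6 * 7', context))  # None: a statement has no value
print(run_snippet('x + 1', context))      # 43: an expression returns a value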
17 changes: 9 additions & 8 deletions gramex/apps/admin2/gramexadmin.py
@@ -149,10 +149,11 @@ def evaluate(handler, code):
try:
context = contexts.setdefault(handler.session['id'], {})
context['handler'] = handler
# B307:eval B102:exec_used is safe since only admin can run this
if mode == 'eval':
result = eval(co, context) # nosec: only admin can run this
result = eval(co, context) # nosec B307
else:
exec(co, context) # nosec: only admin can run this
exec(co, context) # nosec B102
result = None
except Exception as e:
result = e
@@ -200,12 +201,12 @@ def system_information(handler):

from gramex.cache import Subprocess
apps = {
# shell=True is safe here since the code is constructed entirely in this function
# We use shell to pick up the commands' paths from the shell.
('node', 'version'): Subprocess('node --version', shell=True), # nosec
('npm', 'version'): Subprocess('npm --version', shell=True), # nosec
('yarn', 'version'): Subprocess('yarn --version', shell=True), # nosec
('git', 'version'): Subprocess('git --version', shell=True), # nosec
# B602:any_other_function_with_shell_equals_true is safe here since the code is
# constructed entirely in this function. We use shell to pick up the commands' paths.
('node', 'version'): Subprocess('node --version', shell=True), # nosec B602
('npm', 'version'): Subprocess('npm --version', shell=True), # nosec B602
('yarn', 'version'): Subprocess('yarn --version', shell=True), # nosec B602
('git', 'version'): Subprocess('git --version', shell=True), # nosec B602
}
for key, proc in apps.items():
stdout, stderr = yield proc.wait_for_exit()
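As an illustration (hypothetical, not part of this commit), the same version-probing pattern can be packaged as a coroutine. It assumes, as the code above does, that gramex.cache.Subprocess accepts a shell command string and that wait_for_exit() yields a (stdout, stderr) pair of bytes:

from tornado.gen import coroutine, Return
from gramex.cache import Subprocess


@coroutine
def tool_versions():
    # The commands are hard-coded here too, so shell=True remains safe.
    procs = {
        name: Subprocess(name + ' --version', shell=True)  # nosec B602
        for name in ('node', 'npm', 'git')
    }
    versions = {}
    for name, proc in procs.items():
        stdout, stderr = yield proc.wait_for_exit()
        versions[name] = (stdout or stderr).decode('utf-8', 'ignore').strip()
    raise Return(versions)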
6 changes: 6 additions & 0 deletions gramex/apps/logviewer/gramex.yaml
@@ -1,4 +1,5 @@
# Configurable variables
# LOGVIEWER_DB
# LOGVIEWER_PATH_UI
# LOGVIEWER_PATH_RENDER
# $LOGVIEWER_FORMHANDLER_KWARGS
@@ -9,6 +10,9 @@
# $LOGVIEWER_SCHEDULER_KWARGS

variables:
LOGVIEWER_DB:
default:
url: sqlite:///$GRAMEXDATA/logs/logviewer.db
LOGVIEWER_SCHEDULER_PORT:
default: ''
LOGVIEWER_PATH_UI:
@@ -149,6 +153,7 @@ schedule:
apps/logviewer-$* if '--listen.port=' + LOGVIEWER_SCHEDULER_PORT in ''.join(sys.argv[1:]) or not LOGVIEWER_SCHEDULER_PORT:
function: logviewer.summarize
kwargs:
db: $LOGVIEWER_DB
custom_dims:
import.merge: $LOGVIEWER_CUSTOM_DIMENSIONS
session_threshold: 15
@@ -187,6 +192,7 @@ schedule:
op: NOTCONTAINS
value: '\.js|\.css|\.ico|\.png|\.jpg|\.jpeg|\.gif|\.otf|\.woff.*|\.eot'
as: uri_1
# TODO: this may not work as logviewer.summarize() does not accept arbitrary kwargs!
import.merge: $LOGVIEWER_SCHEDULER_KWARGS
startup: true
# Run at 6pm local time. In India, this is a bit after 0:00 UTC,
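The new LOGVIEWER_DB variable resolves to a plain mapping of gramex.data keyword arguments — by default just a SQLAlchemy url pointing at logviewer.db under $GRAMEXDATA — and the scheduler passes it to logviewer.summarize as db. A hypothetical sketch of how downstream code consumes that mapping (the path is illustrative):

import gramex.data

# What LOGVIEWER_DB resolves to (illustrative path):
db = {'url': 'sqlite:///logs/logviewer.db'}

# gramex.data calls simply splat the mapping, e.g. reading the daily aggregate table
# ('aggD' per DB_CONFIG in logviewer.py) as a DataFrame:
daily = gramex.data.filter(**db, table='aggD', args={})
print(daily.head())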
141 changes: 64 additions & 77 deletions gramex/apps/logviewer/logviewer.py
@@ -1,34 +1,37 @@
import re
import sys
import os.path
import sqlite3
from glob import glob
# lxml.etree is safe on https://github.com/tiran/defusedxml/tree/main/xmltestdata
from lxml.etree import Element # nosec: lxml is fixed
from lxml.html import fromstring, tostring # nosec: lxml is fixed
# B410:import_lxml lxml.etree is safe on https://github.com/tiran/defusedxml/tree/main/xmltestdata
from lxml.etree import Element # nosec B410
from lxml.html import fromstring, tostring # nosec B410
import numpy as np
import pandas as pd
import gramex.data
import gramex.cache
from gramex import conf
from gramex.config import app_log
from gramex.transforms import build_transform
from typing import List

if sys.version_info.major == 3:
unicode = str

DB_CONFIG = {
'table': 'agg{}',
'levels': ['M', 'W', 'D'],
'dimensions': [{'key': 'time', 'freq': '?level'},
'user.id', 'ip', 'status', 'uri'],
'dimensions': [
{'key': 'time', 'freq': '?level'},
'user.id', 'ip', 'status', 'uri'
],
'metrics': {
'duration': ['count', 'sum'],
'new_session': ['sum'],
'session_time': ['sum']
}
}

# TODO: extra_columns should not be a global. One instance may use multiple logviewers!
extra_columns = []
for key in conf.get('schedule', []):
if 'kwargs' in conf.schedule[key] and 'custom_dims' in conf.schedule[key].kwargs:
@@ -61,12 +64,6 @@ def pdagg(df, groups, aggfuncs):
return dff.reset_index()


def table_exists(table, conn):
'''check if table exists in sqlite db'''
query = "SELECT name FROM sqlite_master WHERE type='table' AND name=?"
return not pd.read_sql(query, conn, params=[table]).empty


def add_session(df, duration=30, cutoff_buffer=0):
'''add new_session based on `duration` threshold
add cutoff_buffer in minutes for first and last session requests
@@ -103,21 +100,28 @@ def prepare_logs(df, session_threshold=15, cutoff_buffer=0, custom_dims={}):
return df


def create_column_if_not_exists(table, freq, conn):
for col in extra_columns:
for row in conn.execute(f'PRAGMA table_info({table(freq)})'):
if row[1] == col:
break
else:
query = f'ALTER TABLE {table(freq)} ADD COLUMN "{col}" TEXT DEFAULT ""'
conn.execute(query)
conn.commit()


def summarize(transforms=[], post_transforms=[], run=True,
session_threshold=15, cutoff_buffer=0, custom_dims=None):
'''summarize'''
app_log.info('logviewer: Summarize started')
def summarize(
db: dict,
transforms: List[dict] = [],
post_transforms: List[dict] = [],
session_threshold: float = 15,
cutoff_buffer: float = 0,
custom_dims: dict = None) -> None:
'''Summarize log files into a database, periodically.
Parameters:
db: SQLAlchemy database configuration passed to gramex.data (e.g. a `url` key).
transforms: list of transforms to apply to the raw log data.
post_transforms: list of transforms to apply to the aggregated data.
session_threshold: idle time (in minutes) beyond which a new session starts.
cutoff_buffer: buffer (in minutes) applied to the first and last requests of a session.
custom_dims: custom columns to add to the logviewer tables.
This function is called by a scheduler and/or on Gramex startup.
It aggregates and updates logs from the requests.csv file(s), processing only entries
newer than the timestamp of the last stored logs, and creates the aggregation tables
if they don't exist.
'''
app_log.info('logviewer.summarize started')
levels = DB_CONFIG['levels']
table = DB_CONFIG['table'].format
# dimensions and metrics to summarize
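As a usage illustration (hypothetical, not part of this commit), the scheduler entry in gramex.yaml effectively calls the new signature like this, with db carrying the url from LOGVIEWER_DB:

# Assumes the logviewer module is importable (Gramex resolves it from the app folder).
import logviewer

logviewer.summarize(
    db={'url': 'sqlite:///logs/logviewer.db'},  # illustrative url
    transforms=[],
    post_transforms=[],
    session_threshold=15,
    cutoff_buffer=0,
    custom_dims=None,
)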
@@ -126,70 +130,52 @@ def summarize(transforms=[], post_transforms=[], run=True,
log_file = conf.log.handlers.requests.filename
# Handle for multiple instances requests.csv$LISTENPORT
log_file = '{0}{1}'.format(*log_file.partition('.csv'))
folder = os.path.dirname(log_file)
conn = sqlite3.connect(os.path.join(folder, 'logviewer.db'))

for freq in levels:
try:
create_column_if_not_exists(table, freq, conn)
except sqlite3.OperationalError:
# Inform when table is created for the first time
app_log.info('logviewer: OperationalError: Table does not exist')

# drop agg tables from database
if run in ['drop', 'reload']:
droptable = 'DROP TABLE IF EXISTS {}'.format
for freq in levels:
app_log.info('logviewer: Dropping {} table'.format(table(freq)))
conn.execute(droptable(table(freq)))
conn.commit()
conn.execute('VACUUM')
if run == 'drop':
conn.close()
return
# all log files sorted by modified time
log_files = sorted(glob(log_file + '*'), key=os.path.getmtime)
max_date = None

def filesince(filename, date):
match = re.search(r'(\d{4}-\d{2}-\d{2})$', filename)
backupdate = match.group() if match else ''
return backupdate >= date or backupdate == ''

# get this month log files if db is already created
if table_exists(table(levels[-1]), conn):
query = 'SELECT MAX(time) FROM {}'.format(table(levels[-1])) # nosec: table() is safe
max_date = pd.read_sql(query, conn).iloc[0, 0]
app_log.info(f'logviewer: last processed till {max_date}')
this_month = max_date[:8] + '01'
log_files = [f for f in log_files if filesince(f, this_month)]
# get most recent log files if db is already created
try:
log_filter = gramex.data.filter(**db, table=table(levels[-1]), args={})
max_date = log_filter.sort_values('time', ascending=False)['time'].iloc[0]
max_date = pd.to_datetime(max_date)
except Exception: # noqa
max_date = None
else:
app_log.info(f'logviewer.summarize: processing since {max_date}')
this_month = max_date.strftime('%Y-%m-01')
log_files = [f for f in log_files if filesince(f, this_month)]

if not log_files:
app_log.info('logviewer: no log files to process')
app_log.info('logviewer.summarize: no log files to process')
return
# Create dataframe from log files
columns = conf.log.handlers.requests['keys']
# TODO: avoid concat?
app_log.info(f'logviewer: files to process {log_files}')
app_log.info(f'logviewer.summarize: processing {log_files}')
data = pd.concat([
pd.read_csv(f, names=columns, encoding='utf-8').fillna('-')
for f in log_files
], ignore_index=True)
app_log.info(
'logviewer: prepare_logs {} rows with {} mint session_threshold'.format(
'logviewer.summarize: prepare_logs {} rows with session_threshold={}'.format(
len(data.index), session_threshold))
data = prepare_logs(df=data,
session_threshold=session_threshold,
cutoff_buffer=cutoff_buffer,
custom_dims=custom_dims)
app_log.info('logviewer: processed and returned {} rows'.format(len(data.index)))
app_log.info('logviewer.summarize: processed {} rows'.format(len(data.index)))
# apply transforms on raw data
app_log.info('logviewer: applying transforms')
app_log.info('logviewer.summarize: applying transforms')
for spec in transforms:
apply_transform(data, spec) # applies on copy
# levels should go from M > W > D
for freq in levels:
app_log.info('logviewer.summarize: aggregating {}'.format(table(freq)))
# filter dataframe for max_date.level
if max_date:
date_from = max_date
@@ -199,29 +185,30 @@ def filesince(filename, date):
date_from -= pd.offsets.MonthBegin(1)
data = data[data.time.ge(date_from)]
# delete old records
query = f'DELETE FROM {table(freq)} WHERE time >= ?' # nosec: table() is safe
conn.execute(query, (f'{date_from}',))
conn.commit()
gramex.data.delete(**db, table=table(freq), args={'time>~': [date_from]}, id=['time'])
groups[0]['freq'] = freq
# get summary view
app_log.info('logviewer: pdagg for {}'.format(table(freq)))
dff = pdagg(data, groups, aggfuncs)
# apply post_transforms here
app_log.info('logviewer: applying post_transforms')
for spec in post_transforms:
apply_transform(dff, spec)
# insert new records
try:
dff.to_sql(table(freq), conn, if_exists='append', index=False)
# dff columns should match with table columns
# if not, call summarize run='reload' to
# drop all the tables and rerun the job
except sqlite3.OperationalError:
app_log.info('logviewer: OperationalError: run: reload')
summarize(transforms=transforms, run='reload')
return
conn.close()
app_log.info('logviewer: Summarize completed')
cols = {}
for col in dff.columns:
dt = dff[col].dtype.type
if pd.api.types.is_datetime64_any_dtype(dt):
cols[col] = 'DATETIME'
elif pd.api.types.is_bool_dtype(dt):
cols[col] = 'BOOLEAN'
elif pd.api.types.is_integer_dtype(dt):
cols[col] = 'INTEGER'
elif pd.api.types.is_numeric_dtype(dt):
cols[col] = 'REAL'
else:
cols[col] = 'TEXT'
gramex.data.alter(**db, table=table(freq), columns=cols)
gramex.data.insert(**db, table=table(freq), args=dff.to_dict())
app_log.info('logviewer.summarize: completed')
return


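To make the dtype-to-column-type mapping in summarize() concrete, a small self-contained example; the column names are illustrative and the checks mirror the ones added in this commit, applied to the column dtype:

import pandas as pd

dff = pd.DataFrame({
    'time': pd.to_datetime(['2022-06-01', '2022-06-02']),
    'user.id': ['alice', 'bob'],
    'duration_count': [10, 12],
    'duration_sum': [1.5, 2.25],
})

cols = {}
for col in dff.columns:
    dt = dff[col].dtype
    if pd.api.types.is_datetime64_any_dtype(dt):
        cols[col] = 'DATETIME'
    elif pd.api.types.is_bool_dtype(dt):
        cols[col] = 'BOOLEAN'
    elif pd.api.types.is_integer_dtype(dt):
        cols[col] = 'INTEGER'
    elif pd.api.types.is_numeric_dtype(dt):
        cols[col] = 'REAL'
    else:
        cols[col] = 'TEXT'

print(cols)
# {'time': 'DATETIME', 'user.id': 'TEXT', 'duration_count': 'INTEGER', 'duration_sum': 'REAL'}
# gramex.data.alter() can then add any missing columns before gramex.data.insert().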
6 changes: 4 additions & 2 deletions gramex/apps/ui/__init__.py
@@ -6,7 +6,8 @@
import gramex
import gramex.cache
import string
import subprocess # nosec: only for JS compilation
# B404:import_subprocess only for JS compilation
import subprocess # nosec B404
from hashlib import md5
from tornado.gen import coroutine, Return
from functools import partial
@@ -29,7 +30,8 @@ def join(*args):

def get_cache_key(state):
cache_key = json.dumps(state, sort_keys=True, ensure_ascii=True).encode('utf-8')
return md5(cache_key).hexdigest()[:5] # nosec: non-cryptographic use
# B303:md5 is safe here - it's not for cryptographic use
return md5(cache_key).hexdigest()[:5] # nosec B303


@coroutine
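A small standalone illustration (not part of the commit) of why sort_keys matters for this cache key: equal states serialize to identical JSON, so they hash to the same 5-character key regardless of dict ordering:

import json
from hashlib import md5


def get_cache_key(state):
    cache_key = json.dumps(state, sort_keys=True, ensure_ascii=True).encode('utf-8')
    # B303: md5 is fine here; the digest is only a short cache key, not a credential.
    return md5(cache_key).hexdigest()[:5]  # nosec B303


assert get_cache_key({'a': 1, 'b': 2}) == get_cache_key({'b': 2, 'a': 1})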
4 changes: 2 additions & 2 deletions gramex/apps/ui/setup.js
@@ -41,14 +41,14 @@ fs.readdirSync(themes_guide_root).forEach(function (dir) {
fs.readFileSync(theme_file, 'utf8')
.replace('@import "bootstrap";', '@import "gramexui";')
// Themes Guide disables grid classes. But we want to use them, so kill this line
.replace('$enable-grid-classes:false;\n', ''))
.replace('$enable-grid-classes:false;\n', '') + '\n')
themes.push(`themes-guide/${dir}`)
}
})
execSync('rm -rf bootstrap-themes', { cwd: tmp })

// Save list of themes
fs.writeFileSync('theme/themes.json', JSON.stringify({ 'themes': themes }))
fs.writeFileSync('theme/themes.json', JSON.stringify({ 'themes': themes }) + '\n')


// Utility functions
2 changes: 1 addition & 1 deletion gramex/apps/ui/theme/themes-guide/blue_voltage.scss
@@ -24,4 +24,4 @@ $btn-border-radius-lg:1.6rem;
$btn-border-radius-sm:.8rem;
@import "gramexui";

// Add SASS theme customizations here..
// Add SASS theme customizations here..
2 changes: 1 addition & 1 deletion gramex/apps/ui/theme/themes-guide/boldstrap.scss
@@ -18,4 +18,4 @@ $dark:#3c4055;
$body-bg:#efefef;
@import "gramexui";

// Add SASS theme customizations here..
// Add SASS theme customizations here..
