From d29d0bf4403d729077f08c38efd0a0a1e0a5c8c9 Mon Sep 17 00:00:00 2001
From: systemcatch <30196510+systemcatch@users.noreply.github.com>
Date: Wed, 28 Nov 2018 17:11:54 +0000
Subject: [PATCH 1/2] Allow fetching of historical data for CL-SIC

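- Handle old-format xls files (table layout used before January 2016) correctly.
- Read only the relevant sheet ("gen_real") from the xls file.
- Add geothermal category.
- Avoid errors when a fuel type has no generation in a given hour.
- Add new plants to the thermal plant mapping.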
---
 parsers/CL_SIC.py | 140 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 97 insertions(+), 43 deletions(-)

diff --git a/parsers/CL_SIC.py b/parsers/CL_SIC.py
index c67e87a582..0b6d06f2a2 100644
--- a/parsers/CL_SIC.py
+++ b/parsers/CL_SIC.py
@@ -6,12 +6,13 @@
 from bs4 import BeautifulSoup
 from collections import defaultdict
 from datetime import datetime
+import logging
 import pandas as pd
 import re
 import requests
 
 
-thermal_plants = {
+THERMAL_PLANTS = {
   "Taltal 2 GNL": "gas",
   "Taltal 2": "gas",
   "Taltal 2 Diesel": "oil",
@@ -210,36 +211,66 @@
   "HBS GNL": "gas",
   "Rey": "oil",
   "El Nogal": "oil",
-  "Lepanto": "biomass"
+  "Lepanto": "biomass",
+  "Sta Fe": "unknown",
+  "Collipulli": "unknown",
+  "Totoral": "unknown",
+  "Ancali": "unknown",
+  "Nueva Renca": "gas",
+  "Laja CMPC": "biomass",
+  "Curanilahue": "coal",
+  "Cabrero": "biomass",
+  "Curacautin": "geothermal",
+  "D. Almagro": "gas",
+  "Concon": "gas",
+  "Lautaro": "biomass",
+  "Degan": "oil"
 }
 
 
-def get_xls_data(session = None):
+def get_xls_data(target_datetime = None, session = None):
     """Finds and reads .xls file from url into a pandas dataframe."""
 
-    s = session or requests.Session()
-    document_url = 'https://sic.coordinador.cl/informes-y-documentos/fichas/operacion-real/'
-    req = s.get(document_url)
-    soup = BeautifulSoup(req.text, 'html.parser')
-
-    # Find the latest file.
-    generation_link = soup.find("a", {"title": "Descargar archivo"})
-    extension = generation_link["href"]
-    base_url = "https://sic.coordinador.cl"
-    data_url = base_url + extension
-
-    date_pattern = r'OP(\d+)\.xls'
-    date_str = re.search(date_pattern, extension).group(1)
-
-    date_no_tz = arrow.get(date_str, "YYMMDD")
-    date = date_no_tz.replace(tzinfo='Chile/Continental')
-
+    if not target_datetime:
+        s = session or requests.Session()
+        document_url = 'https://sic.coordinador.cl/informes-y-documentos/fichas/operacion-real/'
+        req = s.get(document_url)
+        soup = BeautifulSoup(req.text, 'html.parser')
+
+        # Find the latest file.
+        generation_link = soup.find("a", {"title": "Descargar archivo"})
+        extension = generation_link["href"]
+        base_url = "https://sic.coordinador.cl"
+        data_url = base_url + extension
+
+        date_pattern = r'OP(\d+)\.xls'
+        date_str = re.search(date_pattern, extension).group(1)
+        date_no_tz = arrow.get(date_str, "YYMMDD")
+        date = date_no_tz.replace(tzinfo='Chile/Continental')
+    else:
+        lookup_date = target_datetime.format('YYMMDD')
+        year = target_datetime.format('YY')
+        data_url = 'https://sic.coordinador.cl/wp-content/uploads/estadisticas/operdiar/{0}/OP{1}.xls'.format(year, lookup_date)
+        date = target_datetime.replace(tzinfo='Chile/Continental')
+
+    # Multiple tables in first excel sheet, only top one is needed.
     col_names = ['Plants'] + list(range(1,24)) + [0]
-    df = pd.read_excel(data_url, skiprows=[0,1,2,3], header=None, index_col=0, skip_footer=300, usecols=25, names=col_names)
+    df = pd.read_excel(data_url, skiprows=[0,1,2], header=None, index_col=0, sheet_name="gen_real", usecols=25, names=col_names)
     df = df.reset_index(drop=True)
     df = df.set_index("Plants")
 
-    return df, date
+    OLD_FORMAT = False
+    # Table layout changed in January 2016, old format will cause total processing to fail.
+    try:
+        df_end = df.index.get_loc('Eólico')
+    except KeyError:
+        df_end = df.index.get_loc('Total Generación SIC')
+        OLD_FORMAT = True
+
+    # Remove unneeded rows.
+    df = df.iloc[:df_end+1]
+
+    return df, date, OLD_FORMAT
 
 
 def combine_generating_units(generation, gen_vals):
@@ -266,7 +297,7 @@ def thermal_processer(df, logger):
 
     # Log any new plants that have been added.
     data_plants = list(thermal_df.index)
-    map_plants = list(thermal_plants.keys())
+    map_plants = list(THERMAL_PLANTS.keys())
     unmapped = list(set(data_plants) - set(map_plants))
 
     for plant in unmapped:
@@ -276,11 +307,17 @@ def thermal_processer(df, logger):
     gas_generation = []
     oil_generation = []
     biomass_generation = []
+    geothermal_generation = []
     unknown_generation = []
 
-    for plant in thermal_plants.keys():
-        plant_vals = thermal_df.loc[plant].to_dict()
-        plant_type = thermal_plants[plant]
+    for plant in THERMAL_PLANTS.keys():
+        try:
+            plant_vals = thermal_df.loc[plant].to_dict()
+        except KeyError:
+            # plant is missing from df
+            continue
+
+        plant_type = THERMAL_PLANTS[plant]
         if plant_type == 'coal':
             coal_generation.append(plant_vals)
         elif plant_type == 'gas':
@@ -289,6 +326,8 @@ def thermal_processer(df, logger):
             oil_generation.append(plant_vals)
         elif plant_type == 'biomass':
             biomass_generation.append(plant_vals)
+        elif plant_type == 'geothermal':
+            geothermal_generation.append(plant_vals)
         else:
             unknown_generation.append(plant_vals)
 
@@ -296,18 +335,20 @@ def thermal_processer(df, logger):
     gas_vals = defaultdict(lambda: 0.0)
     oil_vals = defaultdict(lambda: 0.0)
     biomass_vals = defaultdict(lambda: 0.0)
+    geothermal_vals = defaultdict(lambda: 0.0)
     unknown_vals = defaultdict(lambda: 0.0)
 
     coal = combine_generating_units(coal_generation, coal_vals)
     gas = combine_generating_units(gas_generation, gas_vals)
     oil = combine_generating_units(oil_generation, oil_vals)
     biomass = combine_generating_units(biomass_generation, biomass_vals)
+    geothermal = combine_generating_units(geothermal_generation, geothermal_vals)
     unknown = combine_generating_units(unknown_generation, unknown_vals)
 
-    return coal, gas, oil, biomass, unknown
+    return coal, gas, oil, biomass, geothermal, unknown
 
 
-def data_processer(df, date, logger):
+def data_processer(df, date, old_format, logger):
     """
     Extracts aggregated data for hydro, solar and wind from dataframe.
     Combines with thermal data and an arrow object timestamp.
@@ -319,13 +360,23 @@ def data_processer(df, date, logger):
     gas_vals = thermal_generation[1]
     oil_vals = thermal_generation[2]
     biomass_vals = thermal_generation[3]
-    unknown_vals = thermal_generation[4]
+    geothermal_vals = thermal_generation[4]
+    unknown_vals = thermal_generation[5]
 
     total = df.loc['Total Generación SIC']
 
-    hydro = df.loc['Hidroeléctrico'].to_dict()
-    solar = df.loc['Solar'].to_dict()
-    wind = df.loc['Eólico'].to_dict()
+    if old_format==True:
+        solar = df.loc['Solares'].to_dict()
+        wind = df.loc['Eólicas'].to_dict()
+
+        hydro_running = df.loc['Pasada'].to_dict()
+        hydro_dam = df.loc['Embalse'].to_dict()
+        hydro_joined = defaultdict(lambda: 0.0)
+        hydro = combine_generating_units([hydro_running, hydro_dam], hydro_joined)
+    else:
+        hydro = df.loc['Hidroeléctrico'].to_dict()
+        solar = df.loc['Solar'].to_dict()
+        wind = df.loc['Eólico'].to_dict()
 
     hydro_vals = {k: hydro[k]*total[k] for k in hydro}
     solar_vals = {k: solar[k]*total[k] for k in solar}
@@ -334,14 +385,15 @@ def data_processer(df, date, logger):
     generation_by_hour = []
     for hour in range(0,24):
         production = {}
-        production['hydro'] = hydro_vals[hour]
-        production['wind'] = wind_vals[hour]
-        production['solar'] = solar_vals[hour]
-        production['coal'] = coal_vals[hour]
-        production['gas'] = gas_vals[hour]
-        production['oil'] = oil_vals[hour]
-        production['biomass'] = biomass_vals[hour]
-        production['unknown'] = unknown_vals[hour]
+        production['hydro'] = hydro_vals.get(hour, 0.0)
+        production['wind'] = wind_vals.get(hour, 0.0)
+        production['solar'] = solar_vals.get(hour, 0.0)
+        production['coal'] = coal_vals.get(hour, 0.0)
+        production['gas'] = gas_vals.get(hour, 0.0)
+        production['oil'] = oil_vals.get(hour, 0.0)
+        production['biomass'] = biomass_vals.get(hour, 0.0)
+        production['geothermal'] = geothermal_vals.get(hour, 0.0)
+        production['unknown'] = unknown_vals.get(hour, 0.0)
 
         if hour == 0:
             # Midnight data is for a new day.
@@ -354,7 +406,7 @@ def data_processer(df, date, logger):
     return generation_by_hour
 
 
-def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, logger=None):
+def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, logger=logging.getLogger(__name__)):
     """
     Requests the last known production mix (in MW) of a given country
     Arguments:
@@ -384,8 +436,8 @@ def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, lo
     }
     """
 
-    gxd = get_xls_data(session = None)
-    processing = data_processer(gxd[0], gxd[1], logger)
+    gxd = get_xls_data(target_datetime = target_datetime, session = None)
+    processing = data_processer(gxd[0], gxd[1], gxd[2], logger)
 
     data_by_hour = []
     for processed_data in processing:
@@ -410,3 +462,5 @@ def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, lo
 
     print('fetch_production() ->')
     print(fetch_production())
+    #print('fetch_production(target_datetime=2016-01-01)')
+    #print(fetch_production(target_datetime=arrow.get('2016-01-01')))

From 005f330f79421e3e73764ba6cb656e7ea9a961fa Mon Sep 17 00:00:00 2001
From: systemcatch <30196510+systemcatch@users.noreply.github.com>
Date: Thu, 29 Nov 2018 10:23:45 +0000
Subject: [PATCH 2/2] Apply suggested changes and fix multiplication error

---
 parsers/CL_SIC.py | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/parsers/CL_SIC.py b/parsers/CL_SIC.py
index 0b6d06f2a2..d4c2f0771a 100644
--- a/parsers/CL_SIC.py
+++ b/parsers/CL_SIC.py
@@ -248,6 +248,7 @@ def get_xls_data(target_datetime = None, session = None):
         date_no_tz = arrow.get(date_str, "YYMMDD")
         date = date_no_tz.replace(tzinfo='Chile/Continental')
     else:
+        target_datetime = arrow.get(target_datetime)
         lookup_date = target_datetime.format('YYMMDD')
         year = target_datetime.format('YY')
         data_url = 'https://sic.coordinador.cl/wp-content/uploads/estadisticas/operdiar/{0}/OP{1}.xls'.format(year, lookup_date)
@@ -310,14 +311,10 @@ def thermal_processer(df, logger):
     geothermal_generation = []
     unknown_generation = []
 
-    for plant in THERMAL_PLANTS.keys():
-        try:
-            plant_vals = thermal_df.loc[plant].to_dict()
-        except KeyError:
-            # plant is missing from df
-            continue
+    for plant in data_plants:
+        plant_vals = thermal_df.loc[plant].to_dict()
 
-        plant_type = THERMAL_PLANTS[plant]
+        plant_type = THERMAL_PLANTS.get(plant, 'unknown')
         if plant_type == 'coal':
             coal_generation.append(plant_vals)
         elif plant_type == 'gas':
@@ -348,7 +345,7 @@ def thermal_processer(df, logger):
     return coal, gas, oil, biomass, geothermal, unknown
 
 
-def data_processer(df, date, old_format, logger):
+def data_processer(df, date, is_old_format, logger):
     """
     Extracts aggregated data for hydro, solar and wind from dataframe.
     Combines with thermal data and an arrow object timestamp.
@@ -365,22 +362,21 @@ def data_processer(df, date, old_format, logger):
 
     total = df.loc['Total Generación SIC']
 
-    if old_format==True:
-        solar = df.loc['Solares'].to_dict()
-        wind = df.loc['Eólicas'].to_dict()
+    if is_old_format:
+        solar_vals = df.loc['Solares'].to_dict()
+        wind_vals = df.loc['Eólicas'].to_dict()
 
         hydro_running = df.loc['Pasada'].to_dict()
         hydro_dam = df.loc['Embalse'].to_dict()
         hydro_joined = defaultdict(lambda: 0.0)
-        hydro = combine_generating_units([hydro_running, hydro_dam], hydro_joined)
+        hydro_vals = combine_generating_units([hydro_running, hydro_dam], hydro_joined)
     else:
         hydro = df.loc['Hidroeléctrico'].to_dict()
         solar = df.loc['Solar'].to_dict()
         wind = df.loc['Eólico'].to_dict()
-
-    hydro_vals = {k: hydro[k]*total[k] for k in hydro}
-    solar_vals = {k: solar[k]*total[k] for k in solar}
-    wind_vals = {k: wind[k]*total[k] for k in wind}
+        hydro_vals = {k: hydro[k]*total[k] for k in hydro}
+        solar_vals = {k: solar[k]*total[k] for k in solar}
+        wind_vals = {k: wind[k]*total[k] for k in wind}
 
     generation_by_hour = []
     for hour in range(0,24):
@@ -462,5 +458,5 @@ def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, lo
 
     print('fetch_production() ->')
     print(fetch_production())
-    #print('fetch_production(target_datetime=2016-01-01)')
-    #print(fetch_production(target_datetime=arrow.get('2016-01-01')))
+    #print('fetch_production(target_datetime=2015-01-02)')
+    #print(fetch_production(target_datetime='2015-01-02'))