From d29d0bf4403d729077f08c38efd0a0a1e0a5c8c9 Mon Sep 17 00:00:00 2001 From: systemcatch <30196510+systemcatch@users.noreply.github.com> Date: Wed, 28 Nov 2018 17:11:54 +0000 Subject: [PATCH 1/2] Allow fetching of historical data for CL-SIC - Handle old format xls files correctly. - Read only the correct sheet from xls file. - Add geothermal category. - Avoid error if no generation in hour for fuel type. - Add new plants. --- parsers/CL_SIC.py | 140 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 97 insertions(+), 43 deletions(-) diff --git a/parsers/CL_SIC.py b/parsers/CL_SIC.py index c67e87a582..0b6d06f2a2 100644 --- a/parsers/CL_SIC.py +++ b/parsers/CL_SIC.py @@ -6,12 +6,13 @@ from bs4 import BeautifulSoup from collections import defaultdict from datetime import datetime +import logging import pandas as pd import re import requests -thermal_plants = { +THERMAL_PLANTS = { "Taltal 2 GNL": "gas", "Taltal 2": "gas", "Taltal 2 Diesel": "oil", @@ -210,36 +211,66 @@ "HBS GNL": "gas", "Rey": "oil", "El Nogal": "oil", - "Lepanto": "biomass" + "Lepanto": "biomass", + "Sta Fe": "unknown", + "Collipulli": "unknown", + "Totoral": "unknown", + "Ancali": "unknown", + "Nueva Renca": "gas", + "Laja CMPC": "biomass", + "Curanilahue": "coal", + "Cabrero": "biomass", + "Curacautin": "geothermal", + "D. Almagro": "gas", + "Concon": "gas", + "Lautaro": "biomass", + "Degan": "oil" } -def get_xls_data(session = None): +def get_xls_data(target_datetime = None, session = None): """Finds and reads .xls file from url into a pandas dataframe.""" - s = session or requests.Session() - document_url = 'https://sic.coordinador.cl/informes-y-documentos/fichas/operacion-real/' - req = s.get(document_url) - soup = BeautifulSoup(req.text, 'html.parser') - - # Find the latest file. - generation_link = soup.find("a", {"title": "Descargar archivo"}) - extension = generation_link["href"] - base_url = "https://sic.coordinador.cl" - data_url = base_url + extension - - date_pattern = r'OP(\d+)\.xls' - date_str = re.search(date_pattern, extension).group(1) - - date_no_tz = arrow.get(date_str, "YYMMDD") - date = date_no_tz.replace(tzinfo='Chile/Continental') - + if not target_datetime: + s = session or requests.Session() + document_url = 'https://sic.coordinador.cl/informes-y-documentos/fichas/operacion-real/' + req = s.get(document_url) + soup = BeautifulSoup(req.text, 'html.parser') + + # Find the latest file. + generation_link = soup.find("a", {"title": "Descargar archivo"}) + extension = generation_link["href"] + base_url = "https://sic.coordinador.cl" + data_url = base_url + extension + + date_pattern = r'OP(\d+)\.xls' + date_str = re.search(date_pattern, extension).group(1) + date_no_tz = arrow.get(date_str, "YYMMDD") + date = date_no_tz.replace(tzinfo='Chile/Continental') + else: + lookup_date = target_datetime.format('YYMMDD') + year = target_datetime.format('YY') + data_url = 'https://sic.coordinador.cl/wp-content/uploads/estadisticas/operdiar/{0}/OP{1}.xls'.format(year, lookup_date) + date = target_datetime.replace(tzinfo='Chile/Continental') + + # Multiple tables in first excel sheet, only top one is needed. col_names = ['Plants'] + list(range(1,24)) + [0] - df = pd.read_excel(data_url, skiprows=[0,1,2,3], header=None, index_col=0, skip_footer=300, usecols=25, names=col_names) + df = pd.read_excel(data_url, skiprows=[0,1,2], header=None, index_col=0, sheet_name="gen_real", usecols=25, names=col_names) df = df.reset_index(drop=True) df = df.set_index("Plants") - return df, date + OLD_FORMAT = False + # Table layout changed in January 2016, old format will cause total processing to fail. + try: + df_end = df.index.get_loc('Eólico') + except KeyError: + df_end = df.index.get_loc('Total Generación SIC') + OLD_FORMAT = True + + # Remove unneeded rows. + df = df.iloc[:df_end+1] + + return df, date, OLD_FORMAT def combine_generating_units(generation, gen_vals): @@ -266,7 +297,7 @@ def thermal_processer(df, logger): # Log any new plants that have been added. data_plants = list(thermal_df.index) - map_plants = list(thermal_plants.keys()) + map_plants = list(THERMAL_PLANTS.keys()) unmapped = list(set(data_plants) - set(map_plants)) for plant in unmapped: @@ -276,11 +307,17 @@ def thermal_processer(df, logger): gas_generation = [] oil_generation = [] biomass_generation = [] + geothermal_generation = [] unknown_generation = [] - for plant in thermal_plants.keys(): - plant_vals = thermal_df.loc[plant].to_dict() - plant_type = thermal_plants[plant] + for plant in THERMAL_PLANTS.keys(): + try: + plant_vals = thermal_df.loc[plant].to_dict() + except KeyError: + # plant is missing from df + continue + + plant_type = THERMAL_PLANTS[plant] if plant_type == 'coal': coal_generation.append(plant_vals) elif plant_type == 'gas': @@ -289,6 +326,8 @@ def thermal_processer(df, logger): oil_generation.append(plant_vals) elif plant_type == 'biomass': biomass_generation.append(plant_vals) + elif plant_type == 'geothermal': + geothermal_generation.append(plant_vals) else: unknown_generation.append(plant_vals) @@ -296,18 +335,20 @@ def thermal_processer(df, logger): gas_vals = defaultdict(lambda: 0.0) oil_vals = defaultdict(lambda: 0.0) biomass_vals = defaultdict(lambda: 0.0) + geothermal_vals = defaultdict(lambda: 0.0) unknown_vals = defaultdict(lambda: 0.0) coal = combine_generating_units(coal_generation, coal_vals) gas = combine_generating_units(gas_generation, gas_vals) oil = combine_generating_units(oil_generation, oil_vals) biomass = combine_generating_units(biomass_generation, biomass_vals) + geothermal = combine_generating_units(geothermal_generation, geothermal_vals) unknown = combine_generating_units(unknown_generation, unknown_vals) - return coal, gas, oil, biomass, unknown + return coal, gas, oil, biomass, geothermal, unknown -def data_processer(df, date, logger): +def data_processer(df, date, old_format, logger): """ Extracts aggregated data for hydro, solar and wind from dataframe. Combines with thermal data and an arrow object timestamp. @@ -319,13 +360,23 @@ def data_processer(df, date, logger): gas_vals = thermal_generation[1] oil_vals = thermal_generation[2] biomass_vals = thermal_generation[3] - unknown_vals = thermal_generation[4] + geothermal_vals = thermal_generation[4] + unknown_vals = thermal_generation[5] total = df.loc['Total Generación SIC'] - hydro = df.loc['Hidroeléctrico'].to_dict() - solar = df.loc['Solar'].to_dict() - wind = df.loc['Eólico'].to_dict() + if old_format==True: + solar = df.loc['Solares'].to_dict() + wind = df.loc['Eólicas'].to_dict() + + hydro_running = df.loc['Pasada'].to_dict() + hydro_dam = df.loc['Embalse'].to_dict() + hydro_joined = defaultdict(lambda: 0.0) + hydro = combine_generating_units([hydro_running, hydro_dam], hydro_joined) + else: + hydro = df.loc['Hidroeléctrico'].to_dict() + solar = df.loc['Solar'].to_dict() + wind = df.loc['Eólico'].to_dict() hydro_vals = {k: hydro[k]*total[k] for k in hydro} solar_vals = {k: solar[k]*total[k] for k in solar} @@ -334,14 +385,15 @@ def data_processer(df, date, logger): generation_by_hour = [] for hour in range(0,24): production = {} - production['hydro'] = hydro_vals[hour] - production['wind'] = wind_vals[hour] - production['solar'] = solar_vals[hour] - production['coal'] = coal_vals[hour] - production['gas'] = gas_vals[hour] - production['oil'] = oil_vals[hour] - production['biomass'] = biomass_vals[hour] - production['unknown'] = unknown_vals[hour] + production['hydro'] = hydro_vals.get(hour, 0.0) + production['wind'] = wind_vals.get(hour, 0.0) + production['solar'] = solar_vals.get(hour, 0.0) + production['coal'] = coal_vals.get(hour, 0.0) + production['gas'] = gas_vals.get(hour, 0.0) + production['oil'] = oil_vals.get(hour, 0.0) + production['biomass'] = biomass_vals.get(hour, 0.0) + production['geothermal'] = geothermal_vals.get(hour, 0.0) + production['unknown'] = unknown_vals.get(hour, 0.0) if hour == 0: # Midnight data is for a new day. @@ -354,7 +406,7 @@ def data_processer(df, date, logger): return generation_by_hour -def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, logger=None): +def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, logger=logging.getLogger(__name__)): """ Requests the last known production mix (in MW) of a given country Arguments: @@ -384,8 +436,8 @@ def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, lo } """ - gxd = get_xls_data(session = None) - processing = data_processer(gxd[0], gxd[1], logger) + gxd = get_xls_data(target_datetime = target_datetime, session = None) + processing = data_processer(gxd[0], gxd[1], gxd[2], logger) data_by_hour = [] for processed_data in processing: @@ -410,3 +462,5 @@ def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, lo print('fetch_production() ->') print(fetch_production()) + #print('fetch_production(target_datetime=2016-01-01)') + #print(fetch_production(target_datetime=arrow.get('2016-01-01'))) From 005f330f79421e3e73764ba6cb656e7ea9a961fa Mon Sep 17 00:00:00 2001 From: systemcatch <30196510+systemcatch@users.noreply.github.com> Date: Thu, 29 Nov 2018 10:23:45 +0000 Subject: [PATCH 2/2] Suggested changes and fix multiplication error --- parsers/CL_SIC.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/parsers/CL_SIC.py b/parsers/CL_SIC.py index 0b6d06f2a2..d4c2f0771a 100644 --- a/parsers/CL_SIC.py +++ b/parsers/CL_SIC.py @@ -248,6 +248,7 @@ def get_xls_data(target_datetime = None, session = None): date_no_tz = arrow.get(date_str, "YYMMDD") date = date_no_tz.replace(tzinfo='Chile/Continental') else: + target_datetime = arrow.get(target_datetime) lookup_date = target_datetime.format('YYMMDD') year = target_datetime.format('YY') data_url = 'https://sic.coordinador.cl/wp-content/uploads/estadisticas/operdiar/{0}/OP{1}.xls'.format(year, lookup_date) @@ -310,14 +311,10 @@ def thermal_processer(df, logger): geothermal_generation = [] unknown_generation = [] - for plant in THERMAL_PLANTS.keys(): - try: - plant_vals = thermal_df.loc[plant].to_dict() - except KeyError: - # plant is missing from df - continue + for plant in data_plants: + plant_vals = thermal_df.loc[plant].to_dict() - plant_type = THERMAL_PLANTS[plant] + plant_type = THERMAL_PLANTS.get(plant, 'unknown') if plant_type == 'coal': coal_generation.append(plant_vals) elif plant_type == 'gas': @@ -348,7 +345,7 @@ def thermal_processer(df, logger): return coal, gas, oil, biomass, geothermal, unknown -def data_processer(df, date, old_format, logger): +def data_processer(df, date, is_old_format, logger): """ Extracts aggregated data for hydro, solar and wind from dataframe. Combines with thermal data and an arrow object timestamp. @@ -365,22 +362,21 @@ def data_processer(df, date, old_format, logger): total = df.loc['Total Generación SIC'] - if old_format==True: - solar = df.loc['Solares'].to_dict() - wind = df.loc['Eólicas'].to_dict() + if is_old_format: + solar_vals = df.loc['Solares'].to_dict() + wind_vals = df.loc['Eólicas'].to_dict() hydro_running = df.loc['Pasada'].to_dict() hydro_dam = df.loc['Embalse'].to_dict() hydro_joined = defaultdict(lambda: 0.0) - hydro = combine_generating_units([hydro_running, hydro_dam], hydro_joined) + hydro_vals = combine_generating_units([hydro_running, hydro_dam], hydro_joined) else: hydro = df.loc['Hidroeléctrico'].to_dict() solar = df.loc['Solar'].to_dict() wind = df.loc['Eólico'].to_dict() - - hydro_vals = {k: hydro[k]*total[k] for k in hydro} - solar_vals = {k: solar[k]*total[k] for k in solar} - wind_vals = {k: wind[k]*total[k] for k in wind} + hydro_vals = {k: hydro[k]*total[k] for k in hydro} + solar_vals = {k: solar[k]*total[k] for k in solar} + wind_vals = {k: wind[k]*total[k] for k in wind} generation_by_hour = [] for hour in range(0,24): @@ -462,5 +458,5 @@ def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, lo print('fetch_production() ->') print(fetch_production()) - #print('fetch_production(target_datetime=2016-01-01)') - #print(fetch_production(target_datetime=arrow.get('2016-01-01'))) + #print('fetch_production(target_datetime=2015-01-02)') + #print(fetch_production(target_datetime='2015-01-02'))