New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow fetching of historical data for CL-SIC #1692
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -6,12 +6,13 @@ | |||||||
from bs4 import BeautifulSoup | ||||||||
from collections import defaultdict | ||||||||
from datetime import datetime | ||||||||
import logging | ||||||||
import pandas as pd | ||||||||
import re | ||||||||
import requests | ||||||||
|
||||||||
|
||||||||
thermal_plants = { | ||||||||
THERMAL_PLANTS = { | ||||||||
"Taltal 2 GNL": "gas", | ||||||||
"Taltal 2": "gas", | ||||||||
"Taltal 2 Diesel": "oil", | ||||||||
|
@@ -210,36 +211,66 @@ | |||||||
"HBS GNL": "gas", | ||||||||
"Rey": "oil", | ||||||||
"El Nogal": "oil", | ||||||||
"Lepanto": "biomass" | ||||||||
"Lepanto": "biomass", | ||||||||
"Sta Fe": "unknown", | ||||||||
"Collipulli": "unknown", | ||||||||
"Totoral": "unknown", | ||||||||
"Ancali": "unknown", | ||||||||
"Nueva Renca": "gas", | ||||||||
"Laja CMPC": "biomass", | ||||||||
"Curanilahue": "coal", | ||||||||
"Cabrero": "biomass", | ||||||||
"Curacautin": "geothermal", | ||||||||
"D. Almagro": "gas", | ||||||||
"Concon": "gas", | ||||||||
"Lautaro": "biomass", | ||||||||
"Degan": "oil" | ||||||||
} | ||||||||
|
||||||||
|
||||||||
def get_xls_data(session = None): | ||||||||
def get_xls_data(target_datetime = None, session = None): | ||||||||
"""Finds and reads .xls file from url into a pandas dataframe.""" | ||||||||
|
||||||||
s = session or requests.Session() | ||||||||
document_url = 'https://sic.coordinador.cl/informes-y-documentos/fichas/operacion-real/' | ||||||||
req = s.get(document_url) | ||||||||
soup = BeautifulSoup(req.text, 'html.parser') | ||||||||
|
||||||||
# Find the latest file. | ||||||||
generation_link = soup.find("a", {"title": "Descargar archivo"}) | ||||||||
extension = generation_link["href"] | ||||||||
base_url = "https://sic.coordinador.cl" | ||||||||
data_url = base_url + extension | ||||||||
|
||||||||
date_pattern = r'OP(\d+)\.xls' | ||||||||
date_str = re.search(date_pattern, extension).group(1) | ||||||||
|
||||||||
date_no_tz = arrow.get(date_str, "YYMMDD") | ||||||||
date = date_no_tz.replace(tzinfo='Chile/Continental') | ||||||||
|
||||||||
if not target_datetime: | ||||||||
s = session or requests.Session() | ||||||||
document_url = 'https://sic.coordinador.cl/informes-y-documentos/fichas/operacion-real/' | ||||||||
req = s.get(document_url) | ||||||||
soup = BeautifulSoup(req.text, 'html.parser') | ||||||||
|
||||||||
# Find the latest file. | ||||||||
generation_link = soup.find("a", {"title": "Descargar archivo"}) | ||||||||
extension = generation_link["href"] | ||||||||
base_url = "https://sic.coordinador.cl" | ||||||||
data_url = base_url + extension | ||||||||
|
||||||||
date_pattern = r'OP(\d+)\.xls' | ||||||||
date_str = re.search(date_pattern, extension).group(1) | ||||||||
date_no_tz = arrow.get(date_str, "YYMMDD") | ||||||||
date = date_no_tz.replace(tzinfo='Chile/Continental') | ||||||||
else: | ||||||||
lookup_date = target_datetime.format('YYMMDD') | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
That should make the historical fetching option compatible with test_parser.py & our backend |
||||||||
year = target_datetime.format('YY') | ||||||||
data_url = 'https://sic.coordinador.cl/wp-content/uploads/estadisticas/operdiar/{0}/OP{1}.xls'.format(year, lookup_date) | ||||||||
date = target_datetime.replace(tzinfo='Chile/Continental') | ||||||||
|
||||||||
# Multiple tables in first excel sheet, only top one is needed. | ||||||||
col_names = ['Plants'] + list(range(1,24)) + [0] | ||||||||
df = pd.read_excel(data_url, skiprows=[0,1,2,3], header=None, index_col=0, skip_footer=300, usecols=25, names=col_names) | ||||||||
df = pd.read_excel(data_url, skiprows=[0,1,2], header=None, index_col=0, sheet_name="gen_real", usecols=25, names=col_names) | ||||||||
df = df.reset_index(drop=True) | ||||||||
df = df.set_index("Plants") | ||||||||
|
||||||||
return df, date | ||||||||
OLD_FORMAT = False | ||||||||
# Table layout changed in January 2016, old format will cause total processing to fail. | ||||||||
try: | ||||||||
df_end = df.index.get_loc('Eólico') | ||||||||
except KeyError: | ||||||||
df_end = df.index.get_loc('Total Generación SIC') | ||||||||
OLD_FORMAT = True | ||||||||
|
||||||||
# Remove unneeded rows. | ||||||||
df = df.iloc[:df_end+1] | ||||||||
|
||||||||
return df, date, OLD_FORMAT | ||||||||
|
||||||||
|
||||||||
def combine_generating_units(generation, gen_vals): | ||||||||
|
@@ -266,7 +297,7 @@ def thermal_processer(df, logger): | |||||||
|
||||||||
# Log any new plants that have been added. | ||||||||
data_plants = list(thermal_df.index) | ||||||||
map_plants = list(thermal_plants.keys()) | ||||||||
map_plants = list(THERMAL_PLANTS.keys()) | ||||||||
unmapped = list(set(data_plants) - set(map_plants)) | ||||||||
|
||||||||
for plant in unmapped: | ||||||||
|
@@ -276,11 +307,17 @@ def thermal_processer(df, logger): | |||||||
gas_generation = [] | ||||||||
oil_generation = [] | ||||||||
biomass_generation = [] | ||||||||
geothermal_generation = [] | ||||||||
unknown_generation = [] | ||||||||
|
||||||||
for plant in thermal_plants.keys(): | ||||||||
plant_vals = thermal_df.loc[plant].to_dict() | ||||||||
plant_type = thermal_plants[plant] | ||||||||
for plant in THERMAL_PLANTS.keys(): | ||||||||
try: | ||||||||
plant_vals = thermal_df.loc[plant].to_dict() | ||||||||
except KeyError: | ||||||||
# plant is missing from df | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. — "Maybe log a warning here?" — There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. — "Might be better to loop over the plants in the df rather than the dict mapping here." — There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. — "Indeed. Can you add a TODO, and add a log here (just in case)?" — There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. — "Hopefully my changes are OK; all unmapped plants are being sent to the logger." |
||||||||
continue | ||||||||
|
||||||||
plant_type = THERMAL_PLANTS[plant] | ||||||||
if plant_type == 'coal': | ||||||||
coal_generation.append(plant_vals) | ||||||||
elif plant_type == 'gas': | ||||||||
|
@@ -289,25 +326,29 @@ def thermal_processer(df, logger): | |||||||
oil_generation.append(plant_vals) | ||||||||
elif plant_type == 'biomass': | ||||||||
biomass_generation.append(plant_vals) | ||||||||
elif plant_type == 'geothermal': | ||||||||
geothermal_generation.append(plant_vals) | ||||||||
else: | ||||||||
unknown_generation.append(plant_vals) | ||||||||
|
||||||||
coal_vals = defaultdict(lambda: 0.0) | ||||||||
gas_vals = defaultdict(lambda: 0.0) | ||||||||
oil_vals = defaultdict(lambda: 0.0) | ||||||||
biomass_vals = defaultdict(lambda: 0.0) | ||||||||
geothermal_vals = defaultdict(lambda: 0.0) | ||||||||
unknown_vals = defaultdict(lambda: 0.0) | ||||||||
|
||||||||
coal = combine_generating_units(coal_generation, coal_vals) | ||||||||
gas = combine_generating_units(gas_generation, gas_vals) | ||||||||
oil = combine_generating_units(oil_generation, oil_vals) | ||||||||
biomass = combine_generating_units(biomass_generation, biomass_vals) | ||||||||
geothermal = combine_generating_units(geothermal_generation, geothermal_vals) | ||||||||
unknown = combine_generating_units(unknown_generation, unknown_vals) | ||||||||
|
||||||||
return coal, gas, oil, biomass, unknown | ||||||||
return coal, gas, oil, biomass, geothermal, unknown | ||||||||
|
||||||||
|
||||||||
def data_processer(df, date, logger): | ||||||||
def data_processer(df, date, old_format, logger): | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for clarity you could use |
||||||||
""" | ||||||||
Extracts aggregated data for hydro, solar and wind from dataframe. | ||||||||
Combines with thermal data and an arrow object timestamp. | ||||||||
|
@@ -319,13 +360,23 @@ def data_processer(df, date, logger): | |||||||
gas_vals = thermal_generation[1] | ||||||||
oil_vals = thermal_generation[2] | ||||||||
biomass_vals = thermal_generation[3] | ||||||||
unknown_vals = thermal_generation[4] | ||||||||
geothermal_vals = thermal_generation[4] | ||||||||
unknown_vals = thermal_generation[5] | ||||||||
|
||||||||
total = df.loc['Total Generación SIC'] | ||||||||
|
||||||||
hydro = df.loc['Hidroeléctrico'].to_dict() | ||||||||
solar = df.loc['Solar'].to_dict() | ||||||||
wind = df.loc['Eólico'].to_dict() | ||||||||
if old_format==True: | ||||||||
solar = df.loc['Solares'].to_dict() | ||||||||
wind = df.loc['Eólicas'].to_dict() | ||||||||
|
||||||||
hydro_running = df.loc['Pasada'].to_dict() | ||||||||
hydro_dam = df.loc['Embalse'].to_dict() | ||||||||
hydro_joined = defaultdict(lambda: 0.0) | ||||||||
hydro = combine_generating_units([hydro_running, hydro_dam], hydro_joined) | ||||||||
else: | ||||||||
hydro = df.loc['Hidroeléctrico'].to_dict() | ||||||||
solar = df.loc['Solar'].to_dict() | ||||||||
wind = df.loc['Eólico'].to_dict() | ||||||||
|
||||||||
hydro_vals = {k: hydro[k]*total[k] for k in hydro} | ||||||||
solar_vals = {k: solar[k]*total[k] for k in solar} | ||||||||
|
@@ -334,14 +385,15 @@ def data_processer(df, date, logger): | |||||||
generation_by_hour = [] | ||||||||
for hour in range(0,24): | ||||||||
production = {} | ||||||||
production['hydro'] = hydro_vals[hour] | ||||||||
production['wind'] = wind_vals[hour] | ||||||||
production['solar'] = solar_vals[hour] | ||||||||
production['coal'] = coal_vals[hour] | ||||||||
production['gas'] = gas_vals[hour] | ||||||||
production['oil'] = oil_vals[hour] | ||||||||
production['biomass'] = biomass_vals[hour] | ||||||||
production['unknown'] = unknown_vals[hour] | ||||||||
production['hydro'] = hydro_vals.get(hour, 0.0) | ||||||||
production['wind'] = wind_vals.get(hour, 0.0) | ||||||||
production['solar'] = solar_vals.get(hour, 0.0) | ||||||||
production['coal'] = coal_vals.get(hour, 0.0) | ||||||||
production['gas'] = gas_vals.get(hour, 0.0) | ||||||||
production['oil'] = oil_vals.get(hour, 0.0) | ||||||||
production['biomass'] = biomass_vals.get(hour, 0.0) | ||||||||
production['geothermal'] = geothermal_vals.get(hour, 0.0) | ||||||||
production['unknown'] = unknown_vals.get(hour, 0.0) | ||||||||
|
||||||||
if hour == 0: | ||||||||
# Midnight data is for a new day. | ||||||||
|
@@ -354,7 +406,7 @@ def data_processer(df, date, logger): | |||||||
return generation_by_hour | ||||||||
|
||||||||
|
||||||||
def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, logger=None): | ||||||||
def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, logger=logging.getLogger(__name__)): | ||||||||
""" | ||||||||
Requests the last known production mix (in MW) of a given country | ||||||||
Arguments: | ||||||||
|
@@ -384,8 +436,8 @@ def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, lo | |||||||
} | ||||||||
""" | ||||||||
|
||||||||
gxd = get_xls_data(session = None) | ||||||||
processing = data_processer(gxd[0], gxd[1], logger) | ||||||||
gxd = get_xls_data(target_datetime = target_datetime, session = None) | ||||||||
processing = data_processer(gxd[0], gxd[1], gxd[2], logger) | ||||||||
|
||||||||
data_by_hour = [] | ||||||||
for processed_data in processing: | ||||||||
|
@@ -410,3 +462,5 @@ def fetch_production(zone_key = 'CL-SIC', session=None, target_datetime=None, lo | |||||||
|
||||||||
print('fetch_production() ->') | ||||||||
print(fetch_production()) | ||||||||
#print('fetch_production(target_datetime=2016-01-01)') | ||||||||
#print(fetch_production(target_datetime=arrow.get('2016-01-01'))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the eventuality that their website is not maintained anymore and the HTML tag is never updated, one way to find the latest xls file is to call the excel URL directly ("coordinador.cl/wp-content/uploads/estadisticas/operdiar/18/OP181127.xls") with the latest date returning a "status 200" response, starting from date = now
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's more a nice-to-have than a must-have, but I'm afraid that SIC is not maintaining this webpage anymore in favor of a newer one, so I'd bet the HTML tag will stay old
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, right now you can use `target_datetime` to get files after 6th November; we can change this if the website stops updating permanently.