Skip to content

Commit

Permalink
Create corrected_wrong
Browse files Browse the repository at this point in the history
This is the exact error message I receive in my terminal when running the code: "/home/project/banks_project.py:3: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at pandas-dev/pandas#54466
        
  import pandas as pd
Traceback (most recent call last):
  File "/home/project/banks_project.py", line 55, in <module>
    df = extract(url, table_attribs)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/project/banks_project.py", line 30, in extract
    bank_name = col[1].find_all('a')[1]['title']
                ~~~^^^
IndexError: list index out of range"
  • Loading branch information
HaleStorme committed Feb 21, 2024
1 parent 77c967a commit 4567893
Showing 1 changed file with 70 additions and 0 deletions.
70 changes: 70 additions & 0 deletions corrected_wrong
@@ -0,0 +1,70 @@
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime

# Pipeline configuration.
# Wayback Machine snapshot of Wikipedia's "List of largest banks" page.
url = 'https://web.archive.org/web/20230908091635/https://en.wikipedia.org/wiki/List_of_largest_banks'
# Column names handed to extract() for the scraped DataFrame.
table_attribs = ["Name", "MC_USD_Billion"]
# NOTE(review): presumably the SQLite database file for this pipeline -- confirm it is the one actually opened.
db_name = 'Banks.db'
# Table name used when loading into SQLite and in the verification query.
table_name = 'Largest_banks'
# Path passed to load_to_csv() for the transformed output.
csv_path = './Largest_banks_data.csv'

def log_progress(message):
    """Append a timestamped progress entry to ``./code_log.txt``.

    Each entry is written on its own line as ``<timestamp> : <message>``.

    Args:
        message: Text describing the pipeline stage that was reached.
    """
    # %b (abbreviated month name) gives the same text as %h where %h exists,
    # but %h is a platform-specific alias and is not portable.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'  # Year-Monthname-Day-Hour-Minute-Second
    timestamp = datetime.now().strftime(timestamp_format)
    # Explicit encoding so the log does not depend on the locale default.
    with open("./code_log.txt", "a", encoding="utf-8") as f:
        f.write(f"{timestamp} : {message}\n")

def extract(url, table_attribs):
    """Scrape bank names and USD market capitalisations from a web page.

    Only the first ``<tbody>`` element of the document is read. For every
    table row that holds data cells, the bank name is taken from the
    ``title`` attribute of the second link in the second cell, and the
    market capitalisation from the leading text of the third cell.

    Args:
        url: Address of the HTML page to download.
        table_attribs: Column names for the returned DataFrame.

    Returns:
        A DataFrame with one row per data row of the table, holding the
        "Name" and "MC_USD_Billion" (float) values.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page has no ``<tbody>``, or a data row's second
            cell contains fewer than two links.
        ValueError: If a market-cap cell does not hold a number.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('tbody')

    records = []
    for row in tables[0].find_all('tr'):
        col = row.find_all('td')
        # find_all() returns an empty list (never None) for header rows that
        # only contain <th> cells; indexing such a row raised the IndexError.
        if len(col) < 3:
            continue
        # Second link in the cell -- presumably the first one is a flag/icon
        # link; verify against the page markup if the layout changes.
        bank_name = col[1].find_all('a')[1]['title']
        # strip() removes the trailing newline without eating a digit when
        # there is none; thousands separators would break float().
        market_cap = str(col[2].contents[0]).strip().replace(',', '')
        records.append({"Name": bank_name, "MC_USD_Billion": float(market_cap)})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=table_attribs)

def transform(df, csv_path):
    """Add GBP, EUR and INR market-cap columns derived from the USD column.

    The exchange-rate file must have a ``Currency`` column and a ``Rate``
    column. Each new column ``MC_<CUR>_Billion`` is ``MC_USD_Billion``
    multiplied by that currency's rate, rounded to two decimals.

    Args:
        df: DataFrame containing an ``MC_USD_Billion`` column. It is
            modified in place.
        csv_path: Path of the exchange-rate CSV file.

    Returns:
        The same DataFrame object, with the three columns added.

    Raises:
        KeyError: If a required column or currency code is missing.
    """
    exchange_rate = pd.read_csv(csv_path)
    rates = exchange_rate.set_index('Currency')['Rate'].to_dict()
    # Coerce once so rounding works even if the column arrived as object
    # dtype, then convert whole columns at a time instead of per element.
    usd = df['MC_USD_Billion'].astype(float)
    for currency in ('GBP', 'EUR', 'INR'):
        df[f'MC_{currency}_Billion'] = (usd * rates[currency]).round(2)
    return df

def load_to_csv(df, output_path):
    """Write the DataFrame to a CSV file.

    Args:
        df: DataFrame to save; its index is written as the first column.
        output_path: Destination path of the CSV file.
    """
    # Honour the caller's path rather than the module-level csv_path global.
    df.to_csv(output_path)

def load_to_db(df, sql_connection, table_name):
    """Store the DataFrame as a database table, replacing any existing one.

    Args:
        df: DataFrame to persist; its index is not written.
        sql_connection: Open database connection to write through.
        table_name: Name of the destination table.
    """
    df.to_sql(
        name=table_name,
        con=sql_connection,
        if_exists='replace',
        index=False,
    )

def run_query(query_statement, sql_connection):
    """Echo a SQL statement, execute it, and print the resulting rows.

    Args:
        query_statement: SQL text to run.
        sql_connection: Open database connection to run it on.
    """
    # The statement is echoed before execution, so it is shown even when
    # the query itself fails.
    print(query_statement)
    result_frame = pd.read_sql(sql=query_statement, con=sql_connection)
    print(result_frame)

# ETL driver: extract the table, convert currencies, save to CSV and SQLite,
# then run a verification query. Each stage is recorded via log_progress().
log_progress('Preliminaries complete. Initiating ETL process')
df = extract(url, table_attribs)
log_progress('Data extraction complete. Initiating Transformation process')
df = transform(df, 'exchange_rate.csv')
log_progress('Data transformation complete. Initiating loading process')
load_to_csv(df, csv_path)
log_progress('Data saved to CSV file')
# Open the configured database (db_name) instead of an unrelated hard-coded
# file name, so the configuration constant and the script agree.
sql_connection = sqlite3.connect(db_name)
try:
    log_progress('SQL Connection initiated.')
    load_to_db(df, sql_connection, table_name)
    log_progress('Data loaded to Database as table. Running the query')
    query_statement = f"SELECT * from {table_name} WHERE MC_USD_Billion >= 100"
    run_query(query_statement, sql_connection)
    log_progress('Process Complete.')
finally:
    # Release the connection even when loading or querying raises.
    sql_connection.close()

0 comments on commit 4567893

Please sign in to comment.