/
login.py
144 lines (96 loc) · 4.04 KB
/
login.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from bs4 import BeautifulSoup as bs4
from requests import Session
from lxml import html
#import downloader as dw ############
import requests
from dotenv import load_dotenv
import mechanize as mc
import os
load_dotenv()
class App:
    """Thin wrapper around a mechanize.Browser that signs in to Goodreads.

    The browser keeps the session cookies after ``login``, so it can be
    handed to the scraping helpers below via ``getBrowser``.
    """

    def __init__(self) -> None:
        # Instance attribute instead of the original class attribute:
        # a class-level Browser would share one cookie jar / session
        # across every App instance.
        self.browser = mc.Browser()

    def login(self, login: str, senha: str):
        """Submit the Goodreads sign-in form with the given email/password.

        Returns the mechanize response of the form submission.  (The
        original ``-> bytes`` annotation was wrong: ``submit()`` returns
        a response object, not bytes.)
        """
        self.browser.open("https://www.goodreads.com/user/sign_in")
        # The sign-in form is the first form on the page.
        self.browser.select_form(nr=0)
        self.browser.form['user[email]'] = login
        self.browser.form['user[password]'] = senha
        return self.browser.submit()

    def getBrowser(self) -> mc.Browser:
        """Return the (possibly authenticated) browser instance."""
        return self.browser
def getLists(browser: mc.Browser,
             url: str = "https://www.goodreads.com/rating/voters/173429036?resource_type=Review",
             filename: str = "testeLists.html") -> None:
    """Fetch a Goodreads voters page and dump its parsed HTML to *filename*.

    The defaults reproduce the original hard-coded behaviour; *url* and
    *filename* are parameters so other pages can be archived with the
    same helper.
    """
    res = browser.open(url)
    soup = bs4(res.read(), 'html.parser')
    with open(filename, "w", encoding='utf-8') as out:
        out.write(str(soup))
def getData(browser: mc.Browser,
            url: str = "https://www.goodreads.com/book/show/515601.The_C_Programming_Language",
            filename: str = "metaData.html") -> None:
    """Fetch a Goodreads book page and dump its parsed HTML to *filename*.

    The defaults reproduce the original hard-coded behaviour; *url* and
    *filename* are parameters so any book page can be saved.
    """
    res = browser.open(url)
    soup = bs4(res.read(), 'html.parser')
    with open(filename, "w", encoding='utf-8') as out:
        out.write(str(soup))
'''
TODO: refactor — the helpers below duplicate the fetch-parse-save pattern.
'''
def getGenders(browser : mc.Browser, url: str, name: str) -> None:
    """Download *url* with *browser* and save the parsed HTML to file *name*."""
    page = browser.open(url).read()
    soup = bs4(page, 'html.parser')
    with open(name, "w", encoding='utf-8') as out:
        out.write(str(soup))
def searchGenders( browser : mc.Browser, gender : str , page = 1) -> None:
    """Crawl every page of a Goodreads shelf and append its book links to links.html.

    *gender* is the shelf name (e.g. "programming-language"); *page* is the
    page fetched first to discover the total page count.
    """
    base = "https://www.goodreads.com"
    url = "{}/shelf/show/{}?page={}".format(base, gender, page)
    res = browser.open(url)
    # Local name `soup` instead of the original `html`, which shadowed the
    # `lxml.html` import.
    soup = bs4(res.read(), "html.parser")
    # Assumes the shelf page exposes pagination inside a div with a
    # max_num_pages attribute, last numeric child being the highest page
    # number — TODO confirm against current Goodreads markup.
    pageCount = soup.select("div[max_num_pages]")
    pageCount = pageCount[0].select(":not(:last-child)")
    maxPage = int(pageCount[-1].get_text())
    # with-statement: the original leaked this file handle.
    with open("links.html", "a") as linkdata:
        # range(1, ...) replaces the original range(maxPage + 1) plus an
        # `if i >= 1` guard that skipped iteration 0.
        for i in range(1, maxPage + 1):
            print("===> getting page {}\n".format(i))
            page_url = "{}/shelf/show/{}?page={}".format(base, gender, i)
            res = browser.open(page_url)
            soup = bs4(res.read(), "html.parser")
            for link in soup.find_all('a', {'class': 'bookTitle'}):
                # Bug fix: hrefs here are site-relative ("/book/show/...");
                # the original prefixed them with the full shelf URL
                # (query string included), producing broken links.
                linkdata.write("<a href='" + base + str(link['href']) + "' ></a>\n")
    print("Ready!")
# --- Script entry: log in, crawl a shelf, then extract text from the saved pages. ---
app = App()
app.login( os.getenv("EMAIL"), os.getenv("SENHA") )
br = app.getBrowser()

query = "programming-language"
searchGenders(br, query)
getLists(br)
getData(br)

# Dump the visible text of the saved voters page to list.txt.
with open("testeLists.html", "r", encoding='utf8') as file:
    bsObj = bs4(file.read(), "lxml")
# with-statement: the original only closed this handle at the very end of
# the script (and re-closed the already-closed `file`).
with open("list.txt", "w", encoding='utf8') as aux:
    for text in bsObj.find_all('div', {'class' : 'leftContainer'}):
        # Plain get_text(): the original's trailing .format() would raise
        # if the scraped text happened to contain '{' or '}'.
        aux.write(text.get_text())
print("Get List")

# Extract metadata fields from the saved book page into data.txt.
with open("metaData.html", "r", encoding='utf-8') as file:
    bsObj = bs4(file.read(), "lxml")
print("getting data")
data = bsObj.find_all('div', {'class' : 'row'})
details = bsObj.find_all('div', {'class' : 'infoBoxRowTitle'})
language = bsObj.find_all('div', {'itemprop' : 'inLanguage'})
name = bsObj.find_all('h1', {'id' : 'bookTitle'})
author = bsObj.find_all('span', {'itemprop' : 'name'})
# NOTE(review): these ids look auto-generated per book page — confirm they
# still match the page saved by getData.
sinopse = bsObj.find_all('span', {'id' : 'freeTextContainer6051659753480502809'})
sinopse2 = bsObj.find_all('span', {'id' : 'freeText6051659753480502809'})
# with-statement: the original never closed data.txt at all.
with open("data.txt", "w", encoding='utf-8') as dt:
    for text in data + details + language + name + author + sinopse + sinopse2:
        dt.write(text.get_text())
print("Finalized!")