Add save_images, save_dir, save_images_dir parameters to CLI
Remove deprecated find_element_by_* and find_elements_by_* (SeleniumHQ/selenium#10712)
Save images using original filename
garoxas committed Dec 11, 2022
1 parent 76e7086 commit e40efec
Showing 3 changed files with 24 additions and 14 deletions.
14 changes: 11 additions & 3 deletions Scweet/scweet.py
@@ -12,7 +12,7 @@

def scrape(since, until=None, words=None, to_account=None, from_account=None, mention_account=None, interval=5, lang=None,
headless=True, limit=float("inf"), display_type="Top", resume=False, proxy=None, hashtag=None,
- show_images=False, save_images=False, save_dir="outputs", filter_replies=False, proximity=False,
+ show_images=False, save_images=False, save_dir="outputs", save_images_dir="images", filter_replies=False, proximity=False,
geocode=None, minreplies=None, minlikes=None, minretweets=None):
"""
scrape data from twitter using requests, starting from <since> until <until>. The program makes a search between each <since> and <until_local>
@@ -131,7 +131,6 @@ def scrape(since, until=None, words=None, to_account=None, from_account=None, me
# save images
if save_images==True:
print("Saving images ...")
- save_images_dir = "images"
if not os.path.exists(save_images_dir):
os.makedirs(save_images_dir)

@@ -184,6 +183,12 @@ def scrape(since, until=None, words=None, to_account=None, from_account=None, me
help='Min. number of likes to the tweet', default=None)
parser.add_argument('--minretweets', type=int,
help='Min. number of retweets to the tweet', default=None)
+ parser.add_argument('--save_images', type=bool,
+                     help='Save images. True or False', default=False)
+ parser.add_argument('--save_dir', type=str,
+                     help='Save dir', default="outputs")
+ parser.add_argument('--save_images_dir', type=str,
+                     help='Save images dir', default="images")


args = parser.parse_args()
@@ -207,8 +212,11 @@ def scrape(since, until=None, words=None, to_account=None, from_account=None, me
minreplies = args.minreplies
minlikes = args.minlikes
minretweets = args.minretweets
+ save_images = args.save_images
+ save_dir = args.save_dir
+ save_images_dir = args.save_images_dir

data = scrape(since=since, until=until, words=words, to_account=to_account, from_account=from_account, mention_account=mention_account,
hashtag=hashtag, interval=interval, lang=lang, headless=headless, limit=limit,
- display_type=display_type, resume=resume, proxy=proxy, filter_replies=False, proximity=proximity,
+ display_type=display_type, resume=resume, proxy=proxy, save_images=save_images, save_dir=save_dir, save_images_dir=save_images_dir, filter_replies=False, proximity=proximity,
geocode=geocode, minreplies=minreplies, minlikes=minlikes, minretweets=minretweets)
20 changes: 10 additions & 10 deletions Scweet/user.py
@@ -18,40 +18,40 @@ def get_user_information(users, driver=None, headless=True):
if user is not None:

try:
- following = driver.find_element_by_xpath(
+ following = driver.find_element(by=By.XPATH, value=
'//a[contains(@href,"/following")]/span[1]/span[1]').text
- followers = driver.find_element_by_xpath(
+ followers = driver.find_element(by=By.XPATH, value=
'//a[contains(@href,"/followers")]/span[1]/span[1]').text
except Exception as e:
# print(e)
return

try:
- element = driver.find_element_by_xpath('//div[contains(@data-testid,"UserProfileHeader_Items")]//a[1]')
+ element = driver.find_element(by=By.XPATH, value='//div[contains(@data-testid,"UserProfileHeader_Items")]//a[1]')
website = element.get_attribute("href")
except Exception as e:
# print(e)
website = ""

try:
- desc = driver.find_element_by_xpath('//div[contains(@data-testid,"UserDescription")]').text
+ desc = driver.find_element(by=By.XPATH, value='//div[contains(@data-testid,"UserDescription")]').text
except Exception as e:
# print(e)
desc = ""
a = 0
try:
- join_date = driver.find_element_by_xpath(
+ join_date = driver.find_element(by=By.XPATH, value=
'//div[contains(@data-testid,"UserProfileHeader_Items")]/span[3]').text
- birthday = driver.find_element_by_xpath(
+ birthday = driver.find_element(by=By.XPATH, value=
'//div[contains(@data-testid,"UserProfileHeader_Items")]/span[2]').text
- location = driver.find_element_by_xpath(
+ location = driver.find_element(by=By.XPATH, value=
'//div[contains(@data-testid,"UserProfileHeader_Items")]/span[1]').text
except Exception as e:
# print(e)
try:
- join_date = driver.find_element_by_xpath(
+ join_date = driver.find_element(by=By.XPATH, value=
'//div[contains(@data-testid,"UserProfileHeader_Items")]/span[2]').text
- span1 = driver.find_element_by_xpath(
+ span1 = driver.find_element(by=By.XPATH, value=
'//div[contains(@data-testid,"UserProfileHeader_Items")]/span[1]').text
if hasNumbers(span1):
birthday = span1
@@ -62,7 +62,7 @@ def get_user_information(users, driver=None, headless=True):
except Exception as e:
# print(e)
try:
- join_date = driver.find_element_by_xpath(
+ join_date = driver.find_element(by=By.XPATH, value=
'//div[contains(@data-testid,"UserProfileHeader_Items")]/span[1]').text
birthday = ""
location = ""
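
These edits mirror the Selenium 4 locator API that replaced the removed find_element_by_* helpers (SeleniumHQ/selenium#10712). A minimal standalone sketch of the pattern, assuming user.py imports By from selenium.webdriver.common.by (driver setup and URL below are illustrative):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
driver.get("https://example.com/profile")  # illustrative page
try:
    # Selenium 4 form: find_element(by=By.XPATH, value=...) replaces find_element_by_xpath(...)
    following = driver.find_element(by=By.XPATH,
                                    value='//a[contains(@href,"/following")]/span[1]/span[1]').text
except Exception:
    following = ""
finally:
    driver.quit()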
4 changes: 3 additions & 1 deletion Scweet/utils.py
@@ -421,4 +421,6 @@ def check_exists_by_xpath(xpath, driver):
def dowload_images(urls, save_dir):
for i, url_v in enumerate(urls):
for j, url in enumerate(url_v):
- urllib.request.urlretrieve(url, save_dir + '/' + str(i + 1) + '_' + str(j + 1) + ".jpg")
+ match = re.search('/pbs.twimg.com/media/([^?]+)\?.*format=([a-z]+)', url)
+ if match:
+     urllib.request.urlretrieve(re.sub(r'name=[a-z0-9]+', 'name=orig', url), save_dir + '/' + match.group(1) + "." + match.group(2))
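
The new download path can be checked in isolation: the first capture group recovers the original media filename, the second its format, and the name= query parameter is rewritten to request the full-resolution file. A small sketch with a made-up URL in the usual pbs.twimg.com media-link shape:

import re

url = "https://pbs.twimg.com/media/AbCdEf123?format=jpg&name=small"  # illustrative URL
match = re.search(r'/pbs.twimg.com/media/([^?]+)\?.*format=([a-z]+)', url)
if match:
    filename = match.group(1) + "." + match.group(2)        # -> AbCdEf123.jpg
    full_res = re.sub(r'name=[a-z0-9]+', 'name=orig', url)  # -> ...?format=jpg&name=orig
    print(filename, full_res)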
