Skip to content

Commit

Permalink
[#7] Add all other parsing methods
Browse files Browse the repository at this point in the history
  • Loading branch information
malparty committed Jun 16, 2021
1 parent 333e515 commit b9e3afd
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 8 deletions.
24 changes: 19 additions & 5 deletions app/services/google_service/parser_service.rb
Expand Up @@ -4,9 +4,17 @@ module GoogleService
class ParserService
require 'nokogiri'

# CSS selector for the links treated as non-ad results: anchors that carry a
# data-ved attribute, have neither a role nor a jsaction attribute, and have not
# been tagged with the `adwords` or `footer-links` class (both classes are added
# to the parsed document in #initialize).
@@non_ads_result_selector = 'a[data-ved]:not([role]):not([jsaction]):not(.adwords):not(.footer-links)'

# Parses the raw HTML and tags the links the other methods need to tell apart.
#
# @param html [String] raw HTML of the page to parse
def initialize(html)
  @html = html
  @document = Nokogiri::HTML.parse(html)

  # selector => marker class, applied in this order:
  #   * AdWords links get `adwords` for easier manipulation
  #   * footer links get `footer-links` so they can be identified
  {
    'div[data-text-ad] a[data-ved]' => 'adwords',
    '#footcnt a' => 'footer-links'
  }.each do |selector, marker|
    @document.css(selector).add_class(marker)
  end
end

def parse_into(keyword)
Expand All @@ -20,28 +28,34 @@ def parse_into(keyword)
end

# Counts the ad links found under the `#tads` element.
#
# Relies on the `adwords` class that #initialize adds to ad anchors.
#
# @return [Integer] number of `.adwords` nodes below `#tads`
def ads_top_count
  @document.css('#tads .adwords').count
end

# Counts every ad link on the page.
#
# Relies on the `adwords` class that #initialize adds to ad anchors.
#
# @return [Integer] number of `.adwords` nodes in the document
def ads_page_count
  @document.css('.adwords').count
end

# Collects the href of each ad link found under the `#tads` element.
#
# Only anchors with a data-ved attribute receive the `adwords` class in
# #initialize; per the original note, that attribute is what filters out the
# "role=list" (sub links) items.
#
# @return [Array<String, nil>] one href per matched link (nil when the attribute is absent)
def ads_top_url
  @document.css('#tads .adwords').map { |a_tag| a_tag['href'] }
end

# Lists the href of every link tagged with the `adwords` class, anywhere in the document.
#
# @return [Array<String, nil>] one href per matched link
def ads_page_url
  ad_links = @document.css('.adwords')
  ad_links.map { |link| link['href'] }
end

# Counts the links matched by the non-ads result selector.
#
# @return [Integer]
def non_ads_result_count
  organic_links = @document.css(@@non_ads_result_selector)
  organic_links.count
end

# Lists the href of each link matched by the non-ads result selector.
#
# @return [Array<String, nil>] one href per matched link
def non_ads_url
  organic_links = @document.css(@@non_ads_result_selector)
  organic_links.map { |link| link['href'] }
end

# Counts every anchor element in the parsed document.
#
# @return [Integer] number of `a` nodes
def total_link_count
  @document.css('a').count
end
end
end
32 changes: 29 additions & 3 deletions spec/services/google_service/parser_service_spec.rb
Expand Up @@ -14,7 +14,7 @@
end
end

context 'when parsing a page having 3 top ads and 3 bottom ads' do
context 'when parsing a page having 3 top ads, 3 bottom ads and 14 non ads links' do
it 'counts exactly 3 top ads' do
result = nil
VCR.use_cassette('google_search_top_ads_6') do
Expand All @@ -33,14 +33,40 @@
expect(described_class.new(result).ads_page_count).to eq(6)
end

# The 'google_search_top_ads_6' cassette is expected to yield exactly these three top ad URLs.
it 'finds exactly the 3 top ads urls' do
  result = nil
  VCR.use_cassette('google_search_top_ads_6') do
    result = GoogleService::ClientService.query('vpn')
  end

  expect(described_class.new(result).ads_top_url).to contain_exactly('https://cloud.google.com/free', 'https://www.expressvpn.com/', 'https://www.top10vpn.com/best-vpn-for-vietnam/')
end

# The 'google_search_top_ads_6' cassette is expected to expose 14 non-ad result links.
it 'counts exactly 14 non ads results' do
  page_html = nil
  VCR.use_cassette('google_search_top_ads_6') { page_html = GoogleService::ClientService.query('vpn') }

  parser = described_class.new(page_html)
  expect(parser.non_ads_result_count).to eq(14)
end

# The list of non-ad URLs must have 14 entries for the same cassette.
it 'gets 14 results' do
  page_html = nil
  VCR.use_cassette('google_search_top_ads_6') { page_html = GoogleService::ClientService.query('vpn') }

  parser = described_class.new(page_html)
  expect(parser.non_ads_url.count).to eq(14)
end

# 113 is the number of links counted by hand in the cassette's raw HTML.
it 'gets exactly 113 links' do
  page_html = nil
  VCR.use_cassette('google_search_top_ads_6') { page_html = GoogleService::ClientService.query('vpn') }

  parser = described_class.new(page_html)
  expect(parser.total_link_count).to eq(113)
end
end
end

0 comments on commit b9e3afd

Please sign in to comment.