Skip to content

Commit

Permalink
Merge pull request #39 from malparty/feature/google-search-parsed
Browse files Browse the repository at this point in the history
[#7] [Backend] As a User, I can query a single keyword and get its Google search results parsed
  • Loading branch information
malparty committed Jul 1, 2021
2 parents 5a3282a + 41d05a6 commit f5fbcf6
Show file tree
Hide file tree
Showing 10 changed files with 3,923 additions and 19 deletions.
1 change: 1 addition & 0 deletions Gemfile
Expand Up @@ -15,6 +15,7 @@ gem 'bootsnap', require: false # Reduces boot times through caching; required in
gem 'i18n-js', '3.5.1' # A library to provide the I18n translations on the Javascript
gem 'jsonapi-serializer' # A fast JSON:API serializer for Ruby Objects.
gem 'httparty' # A HTTP client for Ruby.
gem 'nokogiri' # Nokogiri makes it easy and painless to work with XML and HTML from Ruby

# Authentications & Authorizations
gem 'devise' # Authentication solution for Rails with Warden
Expand Down
1 change: 1 addition & 0 deletions Gemfile.lock
Expand Up @@ -506,6 +506,7 @@ DEPENDENCIES
letter_opener
listen (= 3.1.5)
mini_magick
nokogiri
pagy
pg
pry-byebug
Expand Down
69 changes: 69 additions & 0 deletions app/services/google/parser_service.rb
@@ -0,0 +1,69 @@
# frozen_string_literal: true

module Google
  # Parses the HTML of a Google search result page and extracts link statistics:
  # AdWords links (top of page and whole page), non-ad result links and the
  # total number of links.
  #
  # On initialization the parsed document is annotated in memory: AdWords links
  # receive the `adwords` class and footer links the `footer-links` class, so
  # the selectors below can tell the different kinds of links apart.
  class ParserService
    NON_ADS_RESULT_SELECTOR = 'a[data-ved]:not([role]):not([jsaction]):not(.adwords):not(.footer-links)'
    AD_CONTAINER_ID = 'tads'
    ADWORDS_CLASS = 'adwords'

    # @param html_response [#body] response object whose `body` is the HTML page
    #   (the specs pass the result of Google::ClientService#call)
    # @raise [ArgumentError] when the response has no `body` or its body is blank
    def initialize(html_response:)
      raise ArgumentError, 'response.body cannot be blank' if body_blank?(html_response)

      @html = html_response

      # Parse the body string explicitly rather than relying on the response
      # object itself being usable as a String.
      @document = Nokogiri::HTML.parse(html_response.body)

      # Add a class to all AdWords links for easier manipulation
      document.css('div[data-text-ad] a[data-ved]').add_class(ADWORDS_CLASS)

      # Mark footer links so they can be excluded from the non-ad results
      document.css('#footcnt a').add_class('footer-links')
    end

    # Parses the HTML data and returns a hash with the results.
    #
    # @return [Hash] with the keys :ads_top_count, :ads_page_count,
    #   :ads_top_url, :ads_page_url, :non_ads_result_count, :non_ads_url,
    #   :total_link_count and :html (the object given to the constructor)
    def call
      {
        ads_top_count: ads_top_count,
        ads_page_count: ads_page_count,
        ads_top_url: ads_top_url,
        ads_page_url: ads_page_url,
        non_ads_result_count: non_ads_result_count,
        non_ads_url: non_ads_url,
        total_link_count: total_link_count,
        html: html
      }
    end

    private

    attr_reader :html, :document

    # True when the response cannot provide a usable body. The `respond_to?`
    # check comes first so that values without a `body` (e.g. the `false` that
    # Google::ClientService#call returns on an HTTP error, per its spec) end up
    # as an ArgumentError instead of a NoMethodError.
    def body_blank?(response)
      !response.respond_to?(:body) || response.body.blank?
    end

    # AdWords links located inside the top ads container.
    def top_ad_links
      @top_ad_links ||= document.css("##{AD_CONTAINER_ID} .#{ADWORDS_CLASS}")
    end

    # Every AdWords link of the page.
    def page_ad_links
      @page_ad_links ||= document.css(".#{ADWORDS_CLASS}")
    end

    # Result links that are neither ads nor footer links.
    def non_ad_links
      @non_ad_links ||= document.css(NON_ADS_RESULT_SELECTOR)
    end

    # Collects the href attribute of each link in the given node set.
    def hrefs(links)
      links.map { |a_tag| a_tag['href'] }
    end

    def ads_top_count
      top_ad_links.count
    end

    def ads_page_count
      page_ad_links.count
    end

    def ads_top_url
      hrefs(top_ad_links)
    end

    def ads_page_url
      hrefs(page_ad_links)
    end

    def non_ads_result_count
      non_ad_links.count
    end

    def non_ads_url
      hrefs(non_ad_links)
    end

    def total_link_count
      document.css('a').count
    end
  end
end
File renamed without changes.
File renamed without changes.
3,430 changes: 3,430 additions & 0 deletions spec/fixtures/vcr/google_search/top_ads_1.yml

Large diffs are not rendered by default.

343 changes: 343 additions & 0 deletions spec/fixtures/vcr/google_search/top_ads_6.yml

Large diffs are not rendered by default.

40 changes: 21 additions & 19 deletions spec/services/google/client_service_spec.rb
Expand Up @@ -3,34 +3,36 @@
require 'rails_helper'

RSpec.describe Google::ClientService, type: :service do
context 'when querying a simple keyword' do
it 'returns an HTTParty Response', vcr: 'google_search' do
result = described_class.new(keyword: FFaker::Lorem.word).call
describe '#call' do
context 'when querying a simple keyword' do
it 'returns an HTTParty Response', vcr: 'google_search/base' do
result = described_class.new(keyword: FFaker::Lorem.word).call

expect(result).to be_an_instance_of(HTTParty::Response)
end
expect(result).to be_an_instance_of(HTTParty::Response)
end

it 'queries Google Search', vcr: 'google_search' do
path = described_class.new(keyword: FFaker::Lorem.word).call.request.path
it 'queries Google Search', vcr: 'google_search/base' do
path = described_class.new(keyword: FFaker::Lorem.word).call.request.path

expect(path.to_s).to start_with(described_class::BASE_SEARCH_URL)
expect(path.to_s).to start_with(described_class::BASE_SEARCH_URL)
end
end
end

context 'when google returns an HTTP error' do
it 'returns false', vcr: 'google_warn' do
result = described_class.new(keyword: FFaker::Lorem.word).call
context 'when google returns an HTTP error' do
it 'returns false', vcr: 'google_search/too_many_requests' do
result = described_class.new(keyword: FFaker::Lorem.word).call

expect(result).to eq(false)
end
expect(result).to eq(false)
end

it 'logs a warning with the escaped keyword', vcr: 'google_warn' do
allow(Rails.logger).to receive(:warn)
it 'logs a warning with the escaped keyword', vcr: 'google_search/too_many_requests' do
allow(Rails.logger).to receive(:warn)

word = FFaker::Lorem.word
described_class.new(keyword: word).call
word = FFaker::Lorem.word
described_class.new(keyword: word).call

expect(Rails.logger).to have_received(:warn).with(/#{CGI.escape(word)}/)
expect(Rails.logger).to have_received(:warn).with(/#{CGI.escape(word)}/)
end
end
end
end
54 changes: 54 additions & 0 deletions spec/services/google/parser_service_spec.rb
@@ -0,0 +1,54 @@
# frozen_string_literal: true

require 'rails_helper'

RSpec.describe Google::ParserService, type: :service do
  describe '#call' do
    # Hash returned by the parser for the page fetched with `keyword`.
    # Both definitions are lazy, so the search request is only issued from
    # within the running example (where the cassette named by the `vcr:`
    # metadata is presumably active — confirm against the VCR hook setup).
    subject(:parsed) { described_class.new(html_response: response).call }

    let(:response) { Google::ClientService.new(keyword: keyword).call }

    context 'when parsing a page having 1 top ad' do
      let(:keyword) { 'squarespace' }

      it 'counts exactly 1 top ad', vcr: 'google_search/top_ads_1' do
        expect(parsed[:ads_top_count]).to eq(1)
      end
    end

    context 'when parsing a page having 3 top ads, 3 bottom ads and 14 non ad links' do
      let(:keyword) { 'vpn' }
      let(:expected_top_ad_urls) do
        [
          'https://cloud.google.com/free',
          'https://www.expressvpn.com/',
          'https://www.top10vpn.com/best-vpn-for-vietnam/'
        ]
      end

      it 'counts exactly 3 top ads', vcr: 'google_search/top_ads_6' do
        expect(parsed[:ads_top_count]).to eq(3)
      end

      it 'counts exactly 6 ads in total', vcr: 'google_search/top_ads_6' do
        expect(parsed[:ads_page_count]).to eq(6)
      end

      it 'finds exactly the 3 top ads urls', vcr: 'google_search/top_ads_6' do
        expect(parsed[:ads_top_url]).to match_array(expected_top_ad_urls)
      end

      it 'counts exactly 14 non ad results', vcr: 'google_search/top_ads_6' do
        expect(parsed[:non_ads_result_count]).to eq(14)
      end

      it 'gets 14 results', vcr: 'google_search/top_ads_6' do
        expect(parsed[:non_ads_url].count).to eq(14)
      end

      it 'gets exactly 113 links', vcr: 'google_search/top_ads_6' do
        # Counted from cassette html raw code
        expect(parsed[:total_link_count]).to eq(113)
      end
    end
  end
end
4 changes: 4 additions & 0 deletions spec/support/vcr.rb
Expand Up @@ -10,6 +10,10 @@
c.ignore_request do |request|
URI(request.uri).port == 9200
end
# Uncomment when you need to record a cassette with readable HTML
# c.before_record do |i|
# i.response.body.force_encoding('UTF-8')
# end
c.default_cassette_options = { record: :none, match_requests_on: [:path] }
end

Expand Down

0 comments on commit f5fbcf6

Please sign in to comment.