Skip to content

Commit

Permalink
get through errors and display jobs on the website
Browse files Browse the repository at this point in the history
  • Loading branch information
honzajavorek committed Mar 4, 2022
1 parent cba9be4 commit 67e0441
Show file tree
Hide file tree
Showing 4 changed files with 169 additions and 159 deletions.
161 changes: 161 additions & 0 deletions juniorguru/lib/locations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import re
from functools import lru_cache, wraps

import requests
from lxml import etree

from juniorguru.lib import loggers


# Module-level logger named after this module, obtained through the project's
# own `loggers` helper.
logger = loggers.get(__name__)


# (connect timeout, read timeout) in seconds, passed as `timeout=` to the
# requests.get() calls against api.mapy.cz, see
# https://docs.python-requests.org/en/master/user/advanced/#timeouts
MAPYCZ_REQUEST_TIMEOUT = (3.05, 27)

# Value of the User-Agent header sent with the Mapy.cz requests.
USER_AGENT = 'JuniorGuruBot (+https://junior.guru)'

# Shortcuts consulted by optimize_geocoding(): when a raw location matches
# one of these patterns, the paired address is returned right away and no
# geocoding request is made. Every entry gets its own address dict.
OPTIMIZATIONS = [
    (
        re.compile(rf'\b{keyword}\b'),
        dict(place=city, region=city, country='Česko'),
    )
    for keyword, city in (
        ('Praha', 'Praha'),
        ('Prague', 'Praha'),
        ('Brno', 'Brno'),
        ('Ostrava', 'Ostrava'),
    )
]

# Normalizes the name picked by get_region(); names which are not listed
# here pass through unchanged.
REGIONS_MAPPING = {
    # countries: native-language names translated to their Czech names
    'Deutschland': 'Německo',
    'Polska': 'Polsko',
    'Österreich': 'Rakousko',

    # regions: every Czech region is collapsed to a single city name
    # (presumably the main city of the region; note that 'Středočeský kraj'
    # shares 'Praha' with 'Hlavní město Praha')
    'Hlavní město Praha': 'Praha',
    'Středočeský kraj': 'Praha',
    'Jihočeský kraj': 'České Budějovice',
    'Plzeňský kraj': 'Plzeň',
    'Karlovarský kraj': 'Karlovy Vary',
    'Ústecký kraj': 'Ústí nad Labem',
    'Liberecký kraj': 'Liberec',
    'Královéhradecký kraj': 'Hradec Králové',
    'Pardubický kraj': 'Pardubice',
    'Olomoucký kraj': 'Olomouc',
    'Moravskoslezský kraj': 'Ostrava',
    'Jihomoravský kraj': 'Brno',
    'Zlínský kraj': 'Zlín',
    'Kraj Vysočina': 'Jihlava',
}

# Maps the `type` attribute of the items in the reverse geocoding response
# to the keys of the address dict built by geocode_mapycz(). Items with
# a type which is not listed here are left out of the address.
ADDRESS_TYPES_MAPPING = {
    # Mapy.cz
    'muni': 'place',
    'regi': 'region',
    'coun': 'country',

    # OpenStreetMap
    'osmm': 'place',
    'osmr': 'region',
    'osmc': 'country',
}


class GeocodeError(Exception):
    """Raised by geocode_mapycz() when a request to the Mapy.cz API fails."""


def fetch_locations(locations_raw, **kwargs):
    """Geocode several raw locations and return the distinct results.

    Every item of ``locations_raw`` is passed to fetch_location() together
    with ``kwargs``. Locations which produced no result are dropped and
    duplicates are merged.

    Returns a list of dicts with the keys 'name' and 'region'. The results
    go through a set, so the order of the list is not guaranteed to follow
    the order of the input.
    """
    unique_pairs = {
        pair
        for pair in (fetch_location(raw, **kwargs) for raw in locations_raw)
        if pair
    }
    return [{'name': place, 'region': region_name}
            for place, region_name in unique_pairs]


def fetch_location(location_raw, geocode=None, debug_info=None):
    """Geocode one raw location into a (place, region) tuple.

    ``geocode`` is a callable taking the raw location and returning
    an address dict (or a falsy value if nothing was found). It defaults
    to geocode_mapycz(). ``debug_info``, if given, is only appended to
    the log message written on failure.

    Returns the tuple, or None if the location wasn't found or if anything
    went wrong. Failures are deliberately not propagated, they're logged
    with a traceback instead, so that one bad location doesn't stop
    the processing of the others.
    """
    if not geocode:
        geocode = geocode_mapycz
    try:
        logger.debug(f"Geocoding '{location_raw}'")
        address = geocode(location_raw)
        if not address:
            return None
        try:
            return (address['place'], get_region(address))
        except KeyError as e:
            # re-raise with the whole address in the message, so the log
            # shows what the geocoder actually returned
            raise KeyError(f"{address!r} doesn't have key {e}") from e
    except Exception:
        if debug_info:
            debug_suffix = f', {debug_info!r}'
        else:
            debug_suffix = ''
        logger.exception(f"Geocoding '{location_raw}' failed{debug_suffix}")
    return None


def optimize_geocoding(geocode):
    """Decorate a geocoding function with shortcuts and a result cache.

    The returned wrapper takes a raw location string and:

    1. returns the address paired with the first pattern in OPTIMIZATIONS
       which matches the location, without calling ``geocode`` at all,
    2. otherwise calls ``geocode`` through an LRU cache, so that repeated
       lookups of the same string call ``geocode`` only once.

    Exceptions raised by ``geocode`` propagate and are not cached, so
    a failed lookup is retried on the next call. Returned values, including
    None, are cached, and the very same object is handed out on every hit,
    so callers shouldn't mutate it.

    The wrapper exposes ``cache_clear()`` and ``cache_info()`` of
    the underlying cache.
    """
    # The cache must be created once per decorated function. Creating it
    # inside wrapper() would produce a brand new, empty cache on every call,
    # so nothing would ever be served from it.
    cached_geocode = lru_cache()(geocode)

    @wraps(geocode)
    def wrapper(location_raw):
        for location_re, value in OPTIMIZATIONS:
            if location_re.search(location_raw):
                return value
        return cached_geocode(location_raw)

    wrapper.cache_clear = cached_geocode.cache_clear
    wrapper.cache_info = cached_geocode.cache_info
    return wrapper


@optimize_geocoding
def geocode_mapycz(location_raw):
    """Turn a free-text location into an address dict using the Mapy.cz API.

    Two requests are made. First, api.mapy.cz/geocode resolves the text
    to coordinates (only the first item of the response is used). Then
    api.mapy.cz/rgeocode resolves the coordinates to address parts.

    Returns a dict keyed by the values of ADDRESS_TYPES_MAPPING ('place',
    'region', 'country') for those parts which are present in the response,
    or None if the first request finds nothing.

    Raises GeocodeError if any of the two HTTP requests fails and ValueError
    if the reverse geocoding response has no items. Errors raised while
    parsing the XML are not translated and propagate as they are.
    """
    http_options = dict(headers={'User-Agent': USER_AGENT},
                        timeout=MAPYCZ_REQUEST_TIMEOUT)

    try:
        logger.debug(f"Geocoding '{location_raw}' using api.mapy.cz/geocode")
        geocode_response = requests.get('https://api.mapy.cz/geocode',
                                        params={'query': location_raw},
                                        **http_options)
        geocode_response.raise_for_status()

        candidates = etree.fromstring(geocode_response.content).xpath('//item')
        if not candidates:
            return None

        best = candidates[0]
        title = best.get('title')
        lat = best.get('y')
        lng = best.get('x')
    except requests.RequestException as e:
        raise GeocodeError(f"Unable to geocode '{location_raw}'") from e

    try:
        logger.debug(f"Reverse geocoding '{location_raw}' lat: {lat} lng: {lng} using api.mapy.cz/rgeocode")
        rgeocode_response = requests.get('https://api.mapy.cz/rgeocode',
                                         params={'lat': lat, 'lon': lng},
                                         **http_options)
        rgeocode_response.raise_for_status()

        parts = etree.fromstring(rgeocode_response.content).xpath('//item')
        if not parts:
            raise ValueError('No items in the reverse geocode response')

        # keep only the parts we have a name for, later items win
        address = {}
        for part in parts:
            part_type = part.attrib['type']
            if part_type in ADDRESS_TYPES_MAPPING:
                address[ADDRESS_TYPES_MAPPING[part_type]] = part.attrib['name']
        return address
    except requests.RequestException as e:
        raise GeocodeError(f"Unable to geocode '{location_raw}' (unable to reverse geocode '{title}' lat: {lat} lng: {lng})") from e


def get_region(address):
    """Return the normalized region name for an address dict.

    If the 'country' of the address starts with 'česk' (case-insensitive),
    the 'region' of the address is used, otherwise the 'country' itself is
    used. The chosen name is then translated through REGIONS_MAPPING, names
    missing from the mapping are returned as they are.

    Raises KeyError if the address lacks the key which is needed.
    """
    is_czech = address['country'].lower().startswith('česk')
    chosen = address['region' if is_czech else 'country']
    return REGIONS_MAPPING.get(chosen, chosen)


if __name__ == '__main__':
    # Manual smoke test which geocodes a single location given on the command
    # line and prints both the raw address and the processed result.
    #
    # Usage:
    #     poetry run python -m juniorguru.lib.locations 'Brno, South Moravia'
    import sys
    from pprint import pprint

    # exit with a usage hint instead of an IndexError traceback
    if len(sys.argv) < 2:
        sys.exit("Usage: poetry run python -m juniorguru.lib.locations 'Brno, South Moravia'")

    location_raw = sys.argv[1]
    print('geocode()')
    pprint(geocode_mapycz(location_raw))
    print('---\nfetch_locations()')
    pprint(fetch_locations([location_raw]))
1 change: 0 additions & 1 deletion juniorguru/models/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,6 @@ def tags(self, today=None):
@property
def location(self):
# TODO refactor, this is terrible
print(self.locations)
if not self.locations:
return '?'
if len(self.locations) == 1:
Expand Down
159 changes: 4 additions & 155 deletions juniorguru/sync/jobs_scraped/pipelines/locations.py
Original file line number Diff line number Diff line change
@@ -1,163 +1,12 @@
import re
from functools import lru_cache, wraps

import requests
from lxml import etree

from juniorguru.lib import loggers
from juniorguru.sync.scrape_jobs.settings import USER_AGENT
from juniorguru.lib.locations import fetch_locations


logger = loggers.get(__name__)


# https://docs.python-requests.org/en/master/user/advanced/#timeouts
MAPYCZ_REQUEST_TIMEOUT = (3.05, 27)


OPTIMIZATIONS = [
(re.compile(pattern), value) for pattern, value in [
(r'\bPraha\b', {'place': 'Praha', 'region': 'Praha', 'country': 'Česko'}),
(r'\bPrague\b', {'place': 'Praha', 'region': 'Praha', 'country': 'Česko'}),
(r'\bBrno\b', {'place': 'Brno', 'region': 'Brno', 'country': 'Česko'}),
(r'\bOstrava\b', {'place': 'Ostrava', 'region': 'Ostrava', 'country': 'Česko'}),
]
]

REGIONS_MAPPING = {
# countries
'Deutschland': 'Německo',
'Polska': 'Polsko',
'Österreich': 'Rakousko',

# regions
'Hlavní město Praha': 'Praha',
'Středočeský kraj': 'Praha',
'Jihočeský kraj': 'České Budějovice',
'Plzeňský kraj': 'Plzeň',
'Karlovarský kraj': 'Karlovy Vary',
'Ústecký kraj': 'Ústí nad Labem',
'Liberecký kraj': 'Liberec',
'Královéhradecký kraj': 'Hradec Králové',
'Pardubický kraj': 'Pardubice',
'Olomoucký kraj': 'Olomouc',
'Moravskoslezský kraj': 'Ostrava',
'Jihomoravský kraj': 'Brno',
'Zlínský kraj': 'Zlín',
'Kraj Vysočina': 'Jihlava',
}

ADDRESS_TYPES_MAPPING = {
# Mapy.cz
'muni': 'place',
'regi': 'region',
'coun': 'country',

# OpenStreetMaps
'osmm': 'place',
'osmr': 'region',
'osmc': 'country',
}


class GeocodeError(Exception):
pass


def process(item, geocode=None):
def process(item, **kwargs):
debug_info = dict(title=item.get('title'), company=item.get('company_name'))
location_tuples = [parse_location(loc, geocode, debug_info=debug_info)
for loc in item.get('locations_raw', [])]
location_tuples = set(filter(None, location_tuples))
item['locations'] = [dict(name=name, region=region)
for name, region in location_tuples]
locations_raw = item.get('locations_raw', [])
item['locations'] = fetch_locations(locations_raw, debug_info=debug_info, **kwargs)
return item


def parse_location(location_raw, geocode=None, debug_info=None):
geocode = geocode or geocode_mapycz
try:
logger.debug(f"Geocoding '{location_raw}'")
address = geocode(location_raw)
if address:
try:
return (address['place'], get_region(address))
except KeyError as e:
raise KeyError(f"{address!r} doesn't have key {e}") from e
except Exception:
debug_suffix = f', {debug_info!r}' if debug_info else ''
logger.exception(f"Geocoding '{location_raw}' failed{debug_suffix}")


def optimize_geocoding(geocode):
@wraps(geocode)
def wrapper(location_raw):
for location_re, value in OPTIMIZATIONS:
if location_re.search(location_raw):
return value
return lru_cache(geocode)(location_raw)
return wrapper


@optimize_geocoding
def geocode_mapycz(location_raw):
try:
logger.debug(f"Geocoding '{location_raw}' using api.mapy.cz/geocode")
response = requests.get('https://api.mapy.cz/geocode',
params={'query': location_raw},
headers={'User-Agent': USER_AGENT},
timeout=MAPYCZ_REQUEST_TIMEOUT)
response.raise_for_status()

xml = etree.fromstring(response.content)
items = xml.xpath('//item')
if not items:
return None

item = items[0]
title, lat, lng = item.get('title'), item.get('y'), item.get('x')
except requests.RequestException as e:
raise GeocodeError(f"Unable to geocode '{location_raw}'") from e

try:
logger.debug(f"Reverse geocoding '{location_raw}' lat: {lat} lng: {lng} using api.mapy.cz/rgeocode")
response = requests.get('https://api.mapy.cz/rgeocode',
params={'lat': lat, 'lon': lng},
headers={'User-Agent': USER_AGENT},
timeout=MAPYCZ_REQUEST_TIMEOUT)
response.raise_for_status()

xml = etree.fromstring(response.content)
items = xml.xpath('//item')
if not items:
raise ValueError('No items in the reverse geocode response')

address = {ADDRESS_TYPES_MAPPING[item.attrib['type']]: item.attrib['name']
for item in items if item.attrib['type'] in ADDRESS_TYPES_MAPPING}
return address
except requests.RequestException as e:
raise GeocodeError(f"Unable to geocode '{location_raw}' (unable to reverse geocode '{title}' lat: {lat} lng: {lng})") from e


def get_region(address):
if address['country'].lower().startswith('česk'):
region = address['region']
else:
region = address['country']
return REGIONS_MAPPING.get(region, region)


if __name__ == '__main__':
"""
Usage:
poetry run python -m juniorguru.sync.jobs_scraped.pipelines.locations 'Brno, South Moravia'
"""
import sys
from pprint import pprint

location_raw = sys.argv[1]
print('geocode()')
pprint(geocode_mapycz(location_raw))
print('---\nprocess()')
pprint(process({'locations_raw': [location_raw]}))
7 changes: 4 additions & 3 deletions juniorguru/sync/jobs_submitted.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from juniorguru.sync.scrape_jobs.pipelines.language_parser import parse as parse_language
from juniorguru.sync.jobs_scraped.pipelines.boards_ids import parse_urls as parse_board_ids
from juniorguru.sync.jobs_scraped.pipelines.employment_types_cleaner import clean as clean_employment_types
from juniorguru.sync.jobs_scraped.pipelines.locations import parse_location
from juniorguru.lib.locations import fetch_locations


logger = loggers.get(__name__)
Expand Down Expand Up @@ -55,7 +55,7 @@ def coerce_record(record, today=None):
r'^externí odkaz na pracovní nabídku$': ('apply_url', parse_url),
r'^název firmy$': ('company_name', parse_text),
r'^odkaz na webové stránky firmy$': ('company_url', parse_url),
r'^město, kde se nachází kancelář$': ('locations', parse_locations),
r'^město, kde se nachází kancelář$': ('locations_raw', parse_locations),
r'^je práce na dálku\?$': ('remote', parse_boolean_words),
r'^pracovní poměr$': ('employment_types', parse_employment_types),
r'^text pracovní nabídky$': ('description_html', parse_markdown),
Expand All @@ -79,12 +79,13 @@ def coerce_record(record, today=None):
data['boards_ids'] = parse_board_ids(urls)

data['lang'] = parse_language(data['description_html'])
data['locations'] = fetch_locations(data['locations_raw'])
return data


def parse_locations(value):
if value:
return [parse_location(loc.strip()) for loc in re.split(r'\snebo\s', value)]
return [loc.strip() for loc in re.split(r'\snebo\s', value)]
return []


Expand Down

0 comments on commit 67e0441

Please sign in to comment.