From c2c77ad44d34e63402014d69d6ff42f696507f76 Mon Sep 17 00:00:00 2001 From: Myst <1592048+LeMyst@users.noreply.github.com> Date: Sun, 9 May 2021 22:26:24 +0200 Subject: [PATCH] Change and remove FunctionsEngine functions (#129) * Update delete_statement BREAKING CHANGES Change parameters order revision is now optional Add summary parameter * Update delete_item BREAKING CHANGE Change parameter order reason is now optional * Update wbi_core.py Move format force to json to mediawiki_api_call() Add Exception if format is not json * Update wbi_core.py Remove FunctionsEngine.get_linked_by() and FunctionsEngine.delete_item() WikibaseIntegrator will only support wikibase-oriented functions. * Update wbi_core.py Move _sparql_query_result_to_df() to the end of FunctionsEngine * Rename functions * Rename delete_statements to remove_claims * Rename get_search_results to search_entities * Update wbi_core.py * Remove dataframe support Use sparqldataframe instead * Corretly remove dataframe support * Bump to v0.11.0.dev0 Support python 3.10 * Split wbi_core into exceptions and functions * Update wbi_functions.py Sync with master branch * Add 3.10 to Code Compatibility Inspection * For to Python 3.10.0-alpha.7 Because beta1 from Github actions is bogus * Add MediaWiki API Call example in README.md * Bogus version has been removed * I tried * Typo in comments * Update README.md --- .github/workflows/python-package.yml | 6 +- .../inspectionProfiles/WikibaseIntegrator.xml | 3 +- README.md | 44 +- requirements.txt | 1 - setup.cfg | 4 +- test/test_all.py | 12 +- test/test_wbi_core.py | 9 +- test/test_wbi_login.py | 4 +- wikibaseintegrator/wbi_core.py | 567 +----------------- wikibaseintegrator/wbi_exceptions.py | 88 +++ wikibaseintegrator/wbi_fastrun.py | 12 +- wikibaseintegrator/wbi_functions.py | 394 ++++++++++++ 12 files changed, 547 insertions(+), 597 deletions(-) create mode 100644 wikibaseintegrator/wbi_exceptions.py create mode 100644 wikibaseintegrator/wbi_functions.py diff 
--git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7232087e..5d733871 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -4,16 +4,14 @@ on: push: branches: [ master ] pull_request: - branches: '**' + branches: [ '**' ] jobs: build: runs-on: ubuntu-latest strategy: matrix: - python-version: [ '3.7', '3.8', '3.9' ] - # Add this when Numpy add support to Python 3.10 - # python-version: [ '3.7', '3.8', '3.9', '3.10-dev' ] + python-version: [ '3.7', '3.8', '3.9', '3.10-dev' ] steps: - uses: actions/checkout@v2 diff --git a/.idea/inspectionProfiles/WikibaseIntegrator.xml b/.idea/inspectionProfiles/WikibaseIntegrator.xml index b603a264..c122a9d0 100644 --- a/.idea/inspectionProfiles/WikibaseIntegrator.xml +++ b/.idea/inspectionProfiles/WikibaseIntegrator.xml @@ -32,10 +32,11 @@ diff --git a/README.md b/README.md index e52d1bbd..fec13fd9 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,8 @@ - [Using a Wikibase instance](#using-a-wikibase-instance) - [The Core Parts](#the-core-parts) - [wbi_core.ItemEngine](#wbi_coreitemengine) - - [wbi_core.FunctionsEngine](#wbi_corefunctionsengine) + - [wbi_functions](#wbi_functions) + - [Use MediaWiki API](#use-mediawiki-api) - [wbi_login.Login](#wbi_loginlogin) - [Login using OAuth1 or OAuth2](#login-using-oauth1-or-oauth2) - [Login with a username and a password](#login-with-a-username-and-a-password) @@ -119,18 +120,37 @@ There are two ways of working with Wikibase items: * A user can work with a selected QID to specifically modify the data on the item. This requires that the user knows what he/she is doing and should only be used with great care, as this does not perform consistency checks. -Examples below illustrate the usage of ItemEngine. +## wbi_functions ## -## wbi_core.FunctionsEngine ## - -wbi_core.FunctionsEngine provides a set of static functions to request or manipulate data from MediaWiki API or SPARQL -Service. 
+wbi_functions provides a set of static functions to request or manipulate data from MediaWiki API or SPARQL Service. Features: * Minimize the number of HTTP requests for reads and writes to improve performance * Method to easily execute [SPARQL](https://query.wikidata.org) queries on the Wikibase SPARQL endpoint. +### Use MediaWiki API ### + +WikibaseIntegrator doesn't have functions to make API calls to non-Wikibase actions. You can +use `wbi_functions.mediawiki_api_call_helper()` to make a custom call. + +Example to get the last two revisions of entity Q42: + +```python +from wikibaseintegrator import wbi_functions + +data = { + 'action': 'query', + 'prop': 'revisions', + 'titles': 'Q42', + 'rvlimit': 2, + 'rvprop': 'ids|timestamp|comment|user', + 'rvslots': 'main' +} + +print(wbi_functions.mediawiki_api_call_helper(data, allow_anonymous=True)) +``` + ## wbi_login.Login ## ### Login using OAuth1 or OAuth2 ### @@ -242,8 +262,8 @@ address, or the URL to your bot code repository.) ## Use Mediawiki API ## -The method `wbi_core.FunctionsEngine.mediawiki_api_call_helper()` allows you to execute MediaWiki API POST call. It -takes a mandatory data array (data) and multiple optionals parameters like a login object of type wbi_login.Login, a +The method `wbi_functions.mediawiki_api_call_helper()` allows you to execute a MediaWiki API POST call. It takes a +mandatory data array (data) and multiple optional parameters like a login object of type wbi_login.Login, a mediawiki_api_url string if the Mediawiki is not Wikidata, a user_agent string to set a custom HTTP User Agent header, and an allow_anonymous boolean to force authentication. 
@@ -252,7 +272,7 @@ Example: Retrieve last 10 revisions from Wikidata element Q2 (Earth): ```python -from wikibaseintegrator import wbi_core +from wikibaseintegrator import wbi_functions query = { 'action': 'query', @@ -261,12 +281,12 @@ query = { 'rvlimit': 10 } -print(wbi_core.FunctionsEngine.mediawiki_api_call_helper(query, allow_anonymous=True)) +print(wbi_functions.mediawiki_api_call_helper(query, allow_anonymous=True)) ``` ## Wikibase search entities ## -The method `wbi_core.ItemEngine.get_search_results()` allows for string search in a Wikibase instance. This means that +The method `wbi_functions.search_entities()` allows for string search in a Wikibase instance. This means that labels, descriptions and aliases can be searched for a string of interest. The method takes five arguments: The actual search string (search_string), an optional server (mediawiki_api_url, in case the Wikibase instance used is not Wikidata), an optional user_agent, an optional max_results (default 500), an optional language (default 'en'), and an @@ -275,7 +295,7 @@ option dict_id_label to return a dict of item id and label as a result. ## Merge Wikibase items ## Sometimes, Wikibase items need to be merged. An API call exists for that, and wbi_core implements a method accordingly. 
-`wbi_core.FunctionsEngine.merge_items()` takes five arguments: +`wbi_functions.merge_items()` takes five arguments: the QID of the item which should be merged into another item (from_id), the QID of the item the first item should be merged into (to_id), a login object of type wbi_login.Login to provide the API call with the required authentication information, a server (mediawiki_api_url) if the Wikibase instance is not Wikidata and a flag for ignoring merge diff --git a/requirements.txt b/requirements.txt index 33821f41..c87cd17c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ simplejson~=3.17.2 requests~=2.25.1 -pandas~=1.2.4 mwoauth~=0.3.7 backoff~=1.10.0 pytest~=6.2.4 diff --git a/setup.cfg b/setup.cfg index d532608a..068e1028 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = wikibaseintegrator -version = 0.10.2.dev0 +version = 0.11.0.dev0 author = Myst and WikidataIntegrator authors license = MIT license_files = LICENSE.txt @@ -17,6 +17,7 @@ classifiers = Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 Development Status :: 4 - Beta Operating System :: POSIX Operating System :: MacOS :: MacOS X @@ -32,7 +33,6 @@ packages = wikibaseintegrator install_requires = simplejson requests - pandas mwoauth backoff oauthlib diff --git a/test/test_all.py b/test/test_all.py index 97a3cf09..b61acb31 100644 --- a/test/test_all.py +++ b/test/test_all.py @@ -4,7 +4,7 @@ import requests -from wikibaseintegrator import wbi_core, wbi_fastrun +from wikibaseintegrator import wbi_core, wbi_fastrun, wbi_functions from wikibaseintegrator.wbi_core import MWApiError __author__ = 'Sebastian Burgstaller-Muehlbacher' @@ -14,13 +14,13 @@ class TestMediawikiApiCall(unittest.TestCase): def test_all(self): with self.assertRaises(MWApiError): - wbi_core.FunctionsEngine.mediawiki_api_call_helper(data={'format': 'json', 'action': 'wbgetentities', 
'ids': 'Q42'}, mediawiki_api_url="https://www.wikidataaaaaaa.org", - max_retries=3, retry_after=1, allow_anonymous=True) + wbi_functions.mediawiki_api_call_helper(data={'format': 'json', 'action': 'wbgetentities', 'ids': 'Q42'}, mediawiki_api_url="https://www.wikidataaaaaaa.org", + max_retries=3, retry_after=1, allow_anonymous=True) with self.assertRaises(requests.HTTPError): - wbi_core.FunctionsEngine.mediawiki_api_call_helper(data=None, mediawiki_api_url="https://httpbin.org/status/400", max_retries=3, retry_after=1, allow_anonymous=True) + wbi_functions.mediawiki_api_call_helper(data=None, mediawiki_api_url="https://httpbin.org/status/400", max_retries=3, retry_after=1, allow_anonymous=True) - test = wbi_core.FunctionsEngine.mediawiki_api_call_helper(data={'format': 'json', 'action': 'wbgetentities', 'ids': 'Q42'}, max_retries=3, retry_after=1, - allow_anonymous=True) + test = wbi_functions.mediawiki_api_call_helper(data={'format': 'json', 'action': 'wbgetentities', 'ids': 'Q42'}, max_retries=3, retry_after=1, + allow_anonymous=True) print(test) diff --git a/test/test_wbi_core.py b/test/test_wbi_core.py index cfdd8a24..43260227 100644 --- a/test/test_wbi_core.py +++ b/test/test_wbi_core.py @@ -1,6 +1,6 @@ import unittest -from wikibaseintegrator import wbi_core +from wikibaseintegrator import wbi_core, wbi_functions class TestWbiCore(unittest.TestCase): @@ -105,14 +105,14 @@ def test_label(self): assert item.get_aliases('ak') == ['c'] def test_wd_search(self): - t = wbi_core.FunctionsEngine.get_search_results('rivaroxaban') + t = wbi_functions.search_entities('rivaroxaban') print('Number of results: ', len(t)) self.assertIsNot(len(t), 0) def test_item_generator(self): items = ['Q408883', 'P715', 'Q18046452'] - item_instances = wbi_core.FunctionsEngine.generate_item_instances(items=items) + item_instances = wbi_functions.generate_item_instances(items=items) for qid, item in item_instances: self.assertIn(qid, items) @@ -181,6 +181,3 @@ def 
test_get_reference_properties(self): def test_get_qualifier_properties(self): print(self.common_item.get_qualifier_properties(prop_id='P170')) self.assertTrue(len(self.common_item.get_qualifier_properties(prop_id='P2067'))) - - def test_get_linked_by(self): - self.assertTrue(len(wbi_core.FunctionsEngine.get_linked_by('Q2'))) diff --git a/test/test_wbi_login.py b/test/test_wbi_login.py index 16e13e81..c7894ac3 100644 --- a/test/test_wbi_login.py +++ b/test/test_wbi_login.py @@ -5,7 +5,7 @@ import pytest -from wikibaseintegrator import wbi_login, wbi_core +from wikibaseintegrator import wbi_login, wbi_functions # look for environment variables. if none set, don't do anything WDUSER = os.getenv("WDUSER") @@ -23,4 +23,4 @@ def test_write(): if WDUSER and WDPASS: login = wbi_login.Login(WDUSER, WDPASS) with pytest.raises(ValueError): - wbi_core.FunctionsEngine.mediawiki_api_call_helper(data=None, login=login, mediawiki_api_url='https://unsdfdskfjljzkerezr.org/w/api.php') + wbi_functions.mediawiki_api_call_helper(data=None, login=login, mediawiki_api_url='https://unsdfdskfjljzkerezr.org/w/api.php') diff --git a/wikibaseintegrator/wbi_core.py b/wikibaseintegrator/wbi_core.py index 407f837f..e24cb754 100644 --- a/wikibaseintegrator/wbi_core.py +++ b/wikibaseintegrator/wbi_core.py @@ -1,17 +1,12 @@ import copy -import datetime import json import re from collections import defaultdict -from time import sleep -from warnings import warn -import pandas -import requests - -from wikibaseintegrator import wbi_login -from wikibaseintegrator.wbi_backoff import wbi_backoff +from wikibaseintegrator import wbi_functions from wikibaseintegrator.wbi_config import config +from wikibaseintegrator.wbi_exceptions import IDMissingError, SearchError, SearchOnlyError, NonUniqueLabelDescriptionPairError, MWApiError, CorePropIntegrityException, \ + ManualInterventionReqException from wikibaseintegrator.wbi_fastrun import FastRunContainer @@ -142,10 +137,10 @@ def __init__(self, item_id='', 
new_item=False, data=None, mediawiki_api_url=None raise ValueError("If using a custom ref mode, ref_handler must be set") if (core_props is None) and (self.sparql_endpoint_url not in ItemEngine.distinct_value_props): - ItemEngine.distinct_value_props[self.sparql_endpoint_url] = FunctionsEngine.get_distinct_value_props(self.sparql_endpoint_url, - self.wikibase_url, - self.property_constraint_pid, - self.distinct_values_constraint_qid) + ItemEngine.distinct_value_props[self.sparql_endpoint_url] = wbi_functions.get_distinct_value_props(self.sparql_endpoint_url, + self.wikibase_url, + self.property_constraint_pid, + self.distinct_values_constraint_qid) self.core_props = core_props if core_props is not None else ItemEngine.distinct_value_props[self.sparql_endpoint_url] if self.fast_run: @@ -308,7 +303,7 @@ def get_entity(self): 'format': 'json' } - json_data = FunctionsEngine.mediawiki_api_call_helper(data=params, allow_anonymous=True) + json_data = wbi_functions.mediawiki_api_call_helper(data=params, allow_anonymous=True) return self.parse_json(json_data=json_data['entities'][self.item_id]) def get_property_list(self): @@ -652,7 +647,7 @@ def write(self, login, bot_account=True, edit_summary='', entity_type='item', pr print(payload) try: - json_data = FunctionsEngine.mediawiki_api_call_helper(data=payload, login=login, max_retries=max_retries, retry_after=retry_after, allow_anonymous=allow_anonymous) + json_data = wbi_functions.mediawiki_api_call_helper(data=payload, login=login, max_retries=max_retries, retry_after=retry_after, allow_anonymous=allow_anonymous) if 'error' in json_data and 'messages' in json_data['error']: error_msg_names = set(x.get('name') for x in json_data['error']['messages']) @@ -739,7 +734,7 @@ def __select_item(self): if property_nr in core_props: tmp_qids = set() query = statement.sparql_query.format(wb_url=self.wikibase_url, pid=property_nr, value=statement.get_sparql_value().replace("'", r"\'")) - results = 
FunctionsEngine.execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url, debug=self.debug) + results = wbi_functions.execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url, debug=self.debug) for i in results['results']['bindings']: qid = i['item_id']['value'].split('/')[-1] @@ -934,458 +929,6 @@ def __repr__(self): ) -class FunctionsEngine(object): - - @staticmethod - def mediawiki_api_call(method, mediawiki_api_url=None, session=None, max_retries=1000, retry_after=60, **kwargs): - """ - :param method: 'GET' or 'POST' - :param mediawiki_api_url: - :param session: If a session is passed, it will be used. Otherwise a new requests session is created - :param max_retries: If api request fails due to rate limiting, maxlag, or readonly mode, retry up to - `max_retries` times - :type max_retries: int - :param retry_after: Number of seconds to wait before retrying request (see max_retries) - :type retry_after: int - :param kwargs: Passed to requests.request - :return: - """ - - mediawiki_api_url = config['MEDIAWIKI_API_URL'] if mediawiki_api_url is None else mediawiki_api_url - - # TODO: Add support for 'multipart/form-data' when using POST (https://www.mediawiki.org/wiki/API:Edit#Large_edits) - - response = None - session = session if session else requests.session() - for n in range(max_retries): - try: - response = session.request(method, mediawiki_api_url, **kwargs) - except requests.exceptions.ConnectionError as e: - print("Connection error: {}. Sleeping for {} seconds.".format(e, retry_after)) - sleep(retry_after) - continue - if response.status_code == 503: - print("service unavailable. sleeping for {} seconds".format(retry_after)) - sleep(retry_after) - continue - - response.raise_for_status() - json_data = response.json() - """ - Mediawiki api response has code = 200 even if there are errors. - rate limit doesn't return HTTP 429 either. 
may in the future - https://phabricator.wikimedia.org/T172293 - """ - if 'error' in json_data: - # rate limiting - error_msg_names = set() - if 'messages' in json_data['error']: - error_msg_names = set(x.get('name') for x in json_data['error']['messages']) - if 'actionthrottledtext' in error_msg_names: - sleep_sec = int(response.headers.get('retry-after', retry_after)) - print("{}: rate limited. sleeping for {} seconds".format(datetime.datetime.utcnow(), sleep_sec)) - sleep(sleep_sec) - continue - - # maxlag - if 'code' in json_data['error'] and json_data['error']['code'] == 'maxlag': - sleep_sec = json_data['error'].get('lag', retry_after) - print("{}: maxlag. sleeping for {} seconds".format(datetime.datetime.utcnow(), sleep_sec)) - sleep(sleep_sec) - continue - - # readonly - if 'code' in json_data['error'] and json_data['error']['code'] == 'readonly': - print('The Wikibase instance is currently in readonly mode, waiting for {} seconds'.format(retry_after)) - sleep(retry_after) - continue - - # others case - raise MWApiError(response.json() if response else {}) - - # there is no error or waiting. break out of this loop and parse response - break - else: - # the first time I've ever used for - else!! - # else executes if the for loop completes normally. i.e. 
does not encouter a `break` - # in this case, that means it tried this api call 10 times - raise MWApiError(response.json() if response else {}) - - return json_data - - @staticmethod - def mediawiki_api_call_helper(data, login=None, mediawiki_api_url=None, user_agent=None, allow_anonymous=False, max_retries=1000, retry_after=60): - mediawiki_api_url = config['MEDIAWIKI_API_URL'] if mediawiki_api_url is None else mediawiki_api_url - user_agent = config['USER_AGENT_DEFAULT'] if user_agent is None else user_agent - - if not allow_anonymous: - if login is None: - # Force allow_anonymous as False by default to ask for a login object - raise ValueError("allow_anonymous can't be False and login is None at the same time.") - elif mediawiki_api_url != login.mediawiki_api_url: - raise ValueError("mediawiki_api_url can't be different with the one in the login object.") - - headers = { - 'User-Agent': user_agent - } - - if data is not None: - # format can only be json when using mediawiki_api_call() - if 'format' not in data: - data.update({'format': 'json'}) - - if login is not None and 'token' not in data: - data.update({'token': login.get_edit_token()}) - - if not allow_anonymous: - # Always assert user if allow_anonymous is False - if 'assert' not in data: - data.update({'assert': 'user'}) - if 'token' in data and data['token'] == '+\\': - raise wbi_login.LoginError("Anonymous edit are not allowed by default. 
Set allow_anonymous to True to edit mediawiki anonymously.") - elif 'assert' not in data: - # Always assert anon if allow_anonymous is True - data.update({'assert': 'anon'}) - - login_session = login.get_session() if login is not None else None - - return FunctionsEngine.mediawiki_api_call('POST', mediawiki_api_url, login_session, data=data, headers=headers, max_retries=max_retries, retry_after=retry_after) - - @staticmethod - @wbi_backoff() - def execute_sparql_query(query, prefix=None, endpoint=None, user_agent=None, as_dataframe=False, max_retries=1000, retry_after=60, debug=False): - """ - Static method which can be used to execute any SPARQL query - :param prefix: The URI prefixes required for an endpoint, default is the Wikidata specific prefixes - :param query: The actual SPARQL query string - :param endpoint: The URL string for the SPARQL endpoint. Default is the URL for the Wikidata SPARQL endpoint - :param user_agent: Set a user agent string for the HTTP header to let the Query Service know who you are. - :type user_agent: str - :param as_dataframe: Return result as pandas dataframe - :param max_retries: The number time this function should retry in case of header reports. - :param retry_after: the number of seconds should wait upon receiving either an error code or the Query Service is not reachable. - :param debug: Enable debug output. 
- :type debug: boolean - :return: The results of the query are returned in JSON format - """ - - sparql_endpoint_url = config['SPARQL_ENDPOINT_URL'] if endpoint is None else endpoint - user_agent = config['USER_AGENT_DEFAULT'] if user_agent is None else user_agent - - if prefix: - query = prefix + '\n' + query - - params = { - 'query': '#Tool: wbi_core execute_sparql_query\n' + query, - 'format': 'json' - } - - headers = { - 'Accept': 'application/sparql-results+json', - 'User-Agent': user_agent - } - - if debug: - print(params['query']) - - for n in range(max_retries): - try: - response = requests.post(sparql_endpoint_url, params=params, headers=headers) - except requests.exceptions.ConnectionError as e: - print("Connection error: {}. Sleeping for {} seconds.".format(e, retry_after)) - sleep(retry_after) - continue - if response.status_code == 503: - print("Service unavailable (503). Sleeping for {} seconds".format(retry_after)) - sleep(retry_after) - continue - if response.status_code == 429: - if 'retry-after' in response.headers.keys(): - retry_after = response.headers['retry-after'] - print("Too Many Requests (429). 
Sleeping for {} seconds".format(retry_after)) - sleep(retry_after) - continue - response.raise_for_status() - results = response.json() - - if as_dataframe: - return FunctionsEngine._sparql_query_result_to_df(results) - else: - return results - - @staticmethod - def _sparql_query_result_to_df(results): - - def parse_value(item): - if item.get('datatype') == 'http://www.w3.org/2001/XMLSchema#decimal': - return float(item['value']) - if item.get('datatype') == 'http://www.w3.org/2001/XMLSchema#integer': - return int(item['value']) - if item.get('datatype') == 'http://www.w3.org/2001/XMLSchema#dateTime': - return datetime.datetime.strptime(item['value'], '%Y-%m-%dT%H:%M:%SZ') - return item['value'] - - results = results['results']['bindings'] - results = [{k: parse_value(v) for k, v in item.items()} for item in results] - df = pandas.DataFrame(results) - return df - - @staticmethod - def get_linked_by(qid, mediawiki_api_url=None): - """ - :param qid: Wikidata identifier to which other wikidata items link - :param mediawiki_api_url: default to wikidata's api, but can be changed to any Wikibase - :return: - """ - - mediawiki_api_url = config['MEDIAWIKI_API_URL'] if mediawiki_api_url is None else mediawiki_api_url - - linkedby = [] - whatlinkshere = json.loads(requests.get(mediawiki_api_url + '?action=query&list=backlinks&format=json&bllimit=500&bltitle=' + qid).text) - for link in whatlinkshere['query']['backlinks']: - if link['title'].startswith('Q'): - linkedby.append(link['title']) - while 'continue' in whatlinkshere.keys(): - whatlinkshere = json.loads(requests.get(mediawiki_api_url + '?action=query&list=backlinks&blcontinue=' + - whatlinkshere['continue']['blcontinue'] + '&format=json&bllimit=500&bltitle=' + qid).text) - for link in whatlinkshere['query']['backlinks']: - if link['title'].startswith('Q'): - linkedby.append(link['title']) - return linkedby - - @staticmethod - def merge_items(from_id, to_id, login, ignore_conflicts='', mediawiki_api_url=None, 
user_agent=None, allow_anonymous=False): - """ - A static method to merge two items - :param from_id: The QID which should be merged into another item - :type from_id: string with 'Q' prefix - :param to_id: The QID into which another item should be merged - :type to_id: string with 'Q' prefix - :param login: The object containing the login credentials and cookies. An instance of wbi_login.Login. - :param mediawiki_api_url: The MediaWiki url which should be used - :type mediawiki_api_url: str - :param ignore_conflicts: A string with the values 'description', 'statement' or 'sitelink', separated by a pipe ('|') if using more than one of those. - :type ignore_conflicts: str - :param user_agent: Set a user agent string for the HTTP header to let the Query Service know who you are. - :type user_agent: str - :param allow_anonymous: Allow anonymous edit to the MediaWiki API. Disabled by default. - :type allow_anonymous: bool - """ - - params = { - 'action': 'wbmergeitems', - 'fromid': from_id, - 'toid': to_id, - 'token': login.get_edit_token(), - 'format': 'json', - 'bot': '', - 'ignoreconflicts': ignore_conflicts - } - - if config['MAXLAG'] > 0: - params.update({'maxlag': config['MAXLAG']}) - - return FunctionsEngine.mediawiki_api_call_helper(data=params, login=login, mediawiki_api_url=mediawiki_api_url, user_agent=user_agent, allow_anonymous=allow_anonymous) - - @staticmethod - def delete_item(item, reason, login, mediawiki_api_url=None, user_agent=None, allow_anonymous=False): - """ - Delete an item - :param item: a QID which should be deleted - :type item: string - :param reason: short text about the reason for the deletion request - :type reason: str - :param login: The object containing the login credentials and cookies. An instance of wbi_login.Login. - :param mediawiki_api_url: The MediaWiki url which should be used - :type mediawiki_api_url: str - :param user_agent: Set a user agent string for the HTTP header to let the Query Service know who you are. 
- :type user_agent: str - :param allow_anonymous: Allow anonymous edit to the MediaWiki API. Disabled by default. - :type allow_anonymous: bool - """ - - params = { - 'action': 'delete', - 'title': 'Item:' + item, - 'reason': reason, - 'token': login.get_edit_token(), - 'format': 'json' - } - - if config['MAXLAG'] > 0: - params.update({'maxlag': config['MAXLAG']}) - - return FunctionsEngine.mediawiki_api_call_helper(data=params, login=login, mediawiki_api_url=mediawiki_api_url, user_agent=user_agent, allow_anonymous=allow_anonymous) - - @staticmethod - def delete_statement(statement_id, revision, login, mediawiki_api_url=None, user_agent=None, allow_anonymous=False): - """ - Delete an item - :param statement_id: One GUID or several (pipe-separated) GUIDs identifying the claims to be removed. All claims must belong to the same entity. - :type statement_id: string - :param revision: The numeric identifier for the revision to base the modification on. This is used for detecting conflicts during save. - :type revision: str - :param login: The object containing the login credentials and cookies. An instance of wbi_login.Login. - :param mediawiki_api_url: The MediaWiki url which should be used - :type mediawiki_api_url: str - :param user_agent: Set a user agent string for the HTTP header to let the Query Service know who you are. - :type user_agent: str - :param allow_anonymous: Allow anonymous edit to the MediaWiki API. Disabled by default. 
- :type allow_anonymous: bool - """ - - params = { - 'action': 'wbremoveclaims', - 'claim': statement_id, - 'token': login.get_edit_token(), - 'baserevid': revision, - 'bot': True, - 'format': 'json' - } - - if config['MAXLAG'] > 0: - params.update({'maxlag': config['MAXLAG']}) - - return FunctionsEngine.mediawiki_api_call_helper(data=params, login=login, mediawiki_api_url=mediawiki_api_url, user_agent=user_agent, allow_anonymous=allow_anonymous) - - @staticmethod - def get_search_results(search_string='', search_type='item', mediawiki_api_url=None, user_agent=None, max_results=500, language=None, dict_result=False, allow_anonymous=True): - """ - Performs a search for entities in the Wikibase instance using labels and aliases. - :param search_string: a string which should be searched for in the Wikibase instance (labels and aliases) - :type search_string: str - :param search_type: Search for this type of entity. One of the following values: form, item, lexeme, property, sense - :type search_type: str - :param mediawiki_api_url: Specify the mediawiki_api_url. - :type mediawiki_api_url: str - :param user_agent: The user agent string transmitted in the http header - :type user_agent: str - :param max_results: The maximum number of search results returned. Default 500 - :type max_results: int - :param language: The language in which to perform the search. - :type language: str - :param dict_result: - :type dict_result: boolean - :param allow_anonymous: Allow anonymous edit to the MediaWiki API. Disabled by default. 
- :type allow_anonymous: bool - :return: list - """ - - language = config['DEFAULT_LANGUAGE'] if language is None else language - - params = { - 'action': 'wbsearchentities', - 'language': language, - 'search': search_string, - 'type': search_type, - 'format': 'json', - 'limit': 50 - } - - cont_count = 0 - results = [] - - while True: - params.update({'continue': cont_count}) - - search_results = FunctionsEngine.mediawiki_api_call_helper(data=params, mediawiki_api_url=mediawiki_api_url, user_agent=user_agent, allow_anonymous=allow_anonymous) - - if search_results['success'] != 1: - raise SearchError('Wikibase API wbsearchentities failed') - else: - for i in search_results['search']: - if dict_result: - description = i['description'] if 'description' in i else None - aliases = i['aliases'] if 'aliases' in i else None - results.append({ - 'id': i['id'], - 'label': i['label'], - 'match': i['match'], - 'description': description, - 'aliases': aliases - }) - else: - results.append(i['id']) - - if 'search-continue' not in search_results: - break - else: - cont_count = search_results['search-continue'] - - if cont_count >= max_results: - break - - return results - - @staticmethod - def generate_item_instances(items, mediawiki_api_url=None, login=None, user_agent=None, allow_anonymous=True): - """ - A method which allows for retrieval of a list of Wikidata items or properties. The method generates a list of - tuples where the first value in the tuple is the QID or property ID, whereas the second is the new instance of - ItemEngine containing all the data of the item. This is most useful for mass retrieval of items. - :param user_agent: A custom user agent - :type user_agent: str - :param items: A list of QIDs or property IDs - :type items: list - :param mediawiki_api_url: The MediaWiki url which should be used - :type mediawiki_api_url: str - :param login: The object containing the login credentials and cookies. An instance of wbi_login.Login. 
- :return: A list of tuples, first value in the tuple is the QID or property ID string, second value is the instance of ItemEngine with the corresponding - item data. - :param allow_anonymous: Allow anonymous edit to the MediaWiki API. Disabled by default. - :type allow_anonymous: bool - """ - - assert type(items) == list - - params = { - 'action': 'wbgetentities', - 'ids': '|'.join(items), - 'format': 'json' - } - - reply = FunctionsEngine.mediawiki_api_call_helper(data=params, login=login, mediawiki_api_url=mediawiki_api_url, user_agent=user_agent, allow_anonymous=allow_anonymous) - - item_instances = [] - for qid, v in reply['entities'].items(): - ii = ItemEngine(item_id=qid, item_data=v) - ii.mediawiki_api_url = mediawiki_api_url - item_instances.append((qid, ii)) - - return item_instances - - @staticmethod - def get_distinct_value_props(sparql_endpoint_url=None, wikibase_url=None, property_constraint_pid=None, - distinct_values_constraint_qid=None): - """ - On wikidata, the default core IDs will be the properties with a distinct values constraint select ?p where {?p wdt:P2302 wd:Q21502410} - See: https://www.wikidata.org/wiki/Help:Property_constraints_portal - https://www.wikidata.org/wiki/Help:Property_constraints_portal/Unique_value - """ - - wikibase_url = config['WIKIBASE_URL'] if wikibase_url is None else wikibase_url - property_constraint_pid = config['PROPERTY_CONSTRAINT_PID'] if property_constraint_pid is None else property_constraint_pid - distinct_values_constraint_qid = config['DISTINCT_VALUES_CONSTRAINT_QID'] if distinct_values_constraint_qid is None else distinct_values_constraint_qid - - pcpid = property_constraint_pid - dvcqid = distinct_values_constraint_qid - - query = ''' - SELECT ?p WHERE {{ - ?p <{wb_url}/prop/direct/{prop_nr}> <{wb_url}/entity/{entity}> - }} - '''.format(wb_url=wikibase_url, prop_nr=pcpid, entity=dvcqid) - df = FunctionsEngine.execute_sparql_query(query, endpoint=sparql_endpoint_url, as_dataframe=True) - if df.empty: - 
warn("Warning: No distinct value properties found\n" + - "Please set P2302 and Q21502410 in your Wikibase or set `core_props` manually.\n" + - "Continuing with no core_props") - return set() - else: - df.p = df.p.str.rsplit('/', 1).str[-1] - return set(df.p) - - class JsonParser(object): references = [] qualifiers = [] @@ -3002,93 +2545,3 @@ def from_json(cls, jsn): if jsn['snaktype'] == 'novalue' or jsn['snaktype'] == 'somevalue': return cls(value=None, prop_nr=jsn['property'], snak_type=jsn['snaktype']) return cls(value=jsn['datavalue']['value']['id'], prop_nr=jsn['property']) - - -class MWApiError(Exception): - def __init__(self, error_message): - """ - Base class for Mediawiki API error handling - :param error_message: The error message returned by the Mediawiki API - :type error_message: A Python json representation dictionary of the error message - :return: - """ - self.error_msg = error_message - - def __str__(self): - return repr(self.error_msg) - - -class NonUniqueLabelDescriptionPairError(MWApiError): - def __init__(self, error_message): - """ - This class handles errors returned from the API due to an attempt to create an item which has the same - label and description as an existing item in a certain language. - :param error_message: An API error message containing 'wikibase-validator-label-with-description-conflict' - as the message name. - :type error_message: A Python json representation dictionary of the error message - :return: - """ - self.error_msg = error_message - - def get_language(self): - """ - :return: Returns a 2 letter language string, indicating the language which triggered the error - """ - return self.error_msg['error']['messages'][0]['parameters'][1] - - def get_conflicting_item_qid(self): - """ - :return: Returns the QID string of the item which has the same label and description as the one which should - be set. 
- """ - qid_string = self.error_msg['error']['messages'][0]['parameters'][2] - - return qid_string.split('|')[0][2:] - - def __str__(self): - return repr(self.error_msg) - - -class IDMissingError(Exception): - def __init__(self, value): - self.value = value - - def __str__(self): - return repr(self.value) - - -class SearchError(Exception): - def __init__(self, value): - self.value = value - - def __str__(self): - return repr(self.value) - - -class ManualInterventionReqException(Exception): - def __init__(self, value, property_string, item_list): - self.value = value + ' Property: {}, items affected: {}'.format(property_string, item_list) - - def __str__(self): - return repr(self.value) - - -class CorePropIntegrityException(Exception): - def __init__(self, value): - self.value = value - - def __str__(self): - return repr(self.value) - - -class MergeError(Exception): - def __init__(self, value): - self.value = value - - def __str__(self): - return repr(self.value) - - -class SearchOnlyError(Exception): - """Raised when the ItemEngine is in search_only mode""" - pass diff --git a/wikibaseintegrator/wbi_exceptions.py b/wikibaseintegrator/wbi_exceptions.py new file mode 100644 index 00000000..7cf3450d --- /dev/null +++ b/wikibaseintegrator/wbi_exceptions.py @@ -0,0 +1,88 @@ +class MWApiError(Exception): + def __init__(self, error_message): + """ + Base class for Mediawiki API error handling + :param error_message: The error message returned by the Mediawiki API + :type error_message: A Python json representation dictionary of the error message + :return: + """ + self.error_msg = error_message + + def __str__(self): + return repr(self.error_msg) + + +class NonUniqueLabelDescriptionPairError(MWApiError): + def __init__(self, error_message): + """ + This class handles errors returned from the API due to an attempt to create an item which has the same + label and description as an existing item in a certain language. 
+ :param error_message: An API error message containing 'wikibase-validator-label-with-description-conflict' + as the message name. + :type error_message: A Python json representation dictionary of the error message + :return: + """ + self.error_msg = error_message + + def get_language(self): + """ + :return: Returns a 2 letter language string, indicating the language which triggered the error + """ + return self.error_msg['error']['messages'][0]['parameters'][1] + + def get_conflicting_item_qid(self): + """ + :return: Returns the QID string of the item which has the same label and description as the one which should + be set. + """ + qid_string = self.error_msg['error']['messages'][0]['parameters'][2] + + return qid_string.split('|')[0][2:] + + def __str__(self): + return repr(self.error_msg) + + +class IDMissingError(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) + + +class SearchError(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) + + +class ManualInterventionReqException(Exception): + def __init__(self, value, property_string, item_list): + self.value = value + ' Property: {}, items affected: {}'.format(property_string, item_list) + + def __str__(self): + return repr(self.value) + + +class CorePropIntegrityException(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) + + +class MergeError(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) + + +class SearchOnlyError(Exception): + """Raised when the ItemEngine is in search_only mode""" + pass diff --git a/wikibaseintegrator/wbi_fastrun.py b/wikibaseintegrator/wbi_fastrun.py index 9307d75e..23a222df 100644 --- a/wikibaseintegrator/wbi_fastrun.py +++ b/wikibaseintegrator/wbi_fastrun.py @@ -4,7 +4,7 @@ from functools import lru_cache from itertools import chain -from 
wikibaseintegrator import wbi_core +from wikibaseintegrator import wbi_functions from wikibaseintegrator.wbi_config import config @@ -463,7 +463,7 @@ def _query_data(self, prop_nr: str, use_units=False) -> None: if self.debug: print(query) - r = wbi_core.FunctionsEngine.execute_sparql_query(query, endpoint=self.sparql_endpoint_url)['results']['bindings'] + r = wbi_functions.execute_sparql_query(query, endpoint=self.sparql_endpoint_url)['results']['bindings'] count = int(r[0]['c']['value']) print("Count: {}".format(count)) num_pages = (int(count) // page_size) + 1 @@ -471,7 +471,7 @@ def _query_data(self, prop_nr: str, use_units=False) -> None: while True: # Query header query = ''' - #Tool: wbi_fastrun _query_data + #Tool: WikibaseIntegrator wbi_fastrun._query_data SELECT ?sid ?item ?v ?unit ?pq ?qval ?qunit ?ref ?pr ?rval WHERE {{ @@ -546,7 +546,7 @@ def _query_data(self, prop_nr: str, use_units=False) -> None: if self.debug: print(query) - results = wbi_core.FunctionsEngine.execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url)['results']['bindings'] + results = wbi_functions.execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url)['results']['bindings'] self.format_query_results(results, prop_nr) self.update_frc_from_query(results, prop_nr) page_count += 1 @@ -569,7 +569,7 @@ def _query_lang(self, lang: str, lang_data_type: str): } query = ''' - #Tool: wbi_fastrun _query_lang + #Tool: WikibaseIntegrator wbi_fastrun._query_lang SELECT ?item ?label WHERE {{ {base_filter} @@ -582,7 +582,7 @@ def _query_lang(self, lang: str, lang_data_type: str): if self.debug: print(query) - return wbi_core.FunctionsEngine.execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url)['results']['bindings'] + return wbi_functions.execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url)['results']['bindings'] @staticmethod def _process_lang(result: list): diff --git a/wikibaseintegrator/wbi_functions.py 
b/wikibaseintegrator/wbi_functions.py new file mode 100644 index 00000000..1c4d6c8f --- /dev/null +++ b/wikibaseintegrator/wbi_functions.py @@ -0,0 +1,394 @@ +import datetime +from time import sleep +from warnings import warn + +import requests + +from wikibaseintegrator import wbi_login +from wikibaseintegrator.wbi_backoff import wbi_backoff +from wikibaseintegrator.wbi_config import config +from wikibaseintegrator.wbi_exceptions import MWApiError, SearchError + + +def mediawiki_api_call(method, mediawiki_api_url=None, session=None, max_retries=1000, retry_after=60, **kwargs): + """ + :param method: 'GET' or 'POST' + :param mediawiki_api_url: + :param session: If a session is passed, it will be used. Otherwise a new requests session is created + :param max_retries: If api request fails due to rate limiting, maxlag, or readonly mode, retry up to + `max_retries` times + :type max_retries: int + :param retry_after: Number of seconds to wait before retrying request (see max_retries) + :type retry_after: int + :param kwargs: Passed to requests.request + :return: + """ + + mediawiki_api_url = config['MEDIAWIKI_API_URL'] if mediawiki_api_url is None else mediawiki_api_url + + # TODO: Add support for 'multipart/form-data' when using POST (https://www.mediawiki.org/wiki/API:Edit#Large_edits) + + if 'data' in kwargs and kwargs['data']: + if 'format' not in kwargs['data']: + kwargs['data'].update({'format': 'json'}) + elif kwargs['data']['format'] != 'json': + raise ValueError("'format' can only be 'json' when using mediawiki_api_call()") + + response = None + session = session if session else requests.session() + for n in range(max_retries): + try: + response = session.request(method, mediawiki_api_url, **kwargs) + except requests.exceptions.ConnectionError as e: + print("Connection error: {}. Sleeping for {} seconds.".format(e, retry_after)) + sleep(retry_after) + continue + if response.status_code == 503: + print("service unavailable. 
sleeping for {} seconds".format(retry_after)) + sleep(retry_after) + continue + + response.raise_for_status() + json_data = response.json() + """ + Mediawiki api response has code = 200 even if there are errors. + rate limit doesn't return HTTP 429 either. may in the future + https://phabricator.wikimedia.org/T172293 + """ + if 'error' in json_data: + # rate limiting + error_msg_names = set() + if 'messages' in json_data['error']: + error_msg_names = set(x.get('name') for x in json_data['error']['messages']) + if 'actionthrottledtext' in error_msg_names: + sleep_sec = int(response.headers.get('retry-after', retry_after)) + print("{}: rate limited. sleeping for {} seconds".format(datetime.datetime.utcnow(), sleep_sec)) + sleep(sleep_sec) + continue + + # maxlag + if 'code' in json_data['error'] and json_data['error']['code'] == 'maxlag': + sleep_sec = json_data['error'].get('lag', retry_after) + print("{}: maxlag. sleeping for {} seconds".format(datetime.datetime.utcnow(), sleep_sec)) + sleep(sleep_sec) + continue + + # readonly + if 'code' in json_data['error'] and json_data['error']['code'] == 'readonly': + print('The Wikibase instance is currently in readonly mode, waiting for {} seconds'.format(retry_after)) + sleep(retry_after) + continue + + # others case + raise MWApiError(response.json() if response else {}) + + # there is no error or waiting. break out of this loop and parse response + break + else: + # the first time I've ever used for - else!! + # else executes if the for loop completes normally. i.e. 
does not encounter a `break` + # in this case, that means it tried this api call `max_retries` times + raise MWApiError(response.json() if response else {}) + + return json_data + + +def mediawiki_api_call_helper(data, login=None, mediawiki_api_url=None, user_agent=None, allow_anonymous=False, max_retries=1000, retry_after=60): + mediawiki_api_url = config['MEDIAWIKI_API_URL'] if mediawiki_api_url is None else mediawiki_api_url + user_agent = config['USER_AGENT_DEFAULT'] if user_agent is None else user_agent + + if not allow_anonymous: + if login is None: + # Force allow_anonymous as False by default to ask for a login object + raise ValueError("allow_anonymous can't be False and login is None at the same time.") + elif mediawiki_api_url != login.mediawiki_api_url: + raise ValueError("mediawiki_api_url can't be different with the one in the login object.") + + headers = { + 'User-Agent': user_agent + } + + if data is not None: + if login is not None and 'token' not in data: + data.update({'token': login.get_edit_token()}) + + if not allow_anonymous: + # Always assert user if allow_anonymous is False + if 'assert' not in data: + data.update({'assert': 'user'}) + if 'token' in data and data['token'] == '+\\': + raise wbi_login.LoginError("Anonymous edit are not allowed by default. 
Set allow_anonymous to True to edit mediawiki anonymously.") + elif 'assert' not in data: + # Always assert anon if allow_anonymous is True + data.update({'assert': 'anon'}) + + login_session = login.get_session() if login is not None else None + + return mediawiki_api_call('POST', mediawiki_api_url, login_session, data=data, headers=headers, max_retries=max_retries, retry_after=retry_after) + + +@wbi_backoff() +def execute_sparql_query(query, prefix=None, endpoint=None, user_agent=None, max_retries=1000, retry_after=60, debug=False): + """ + Function which can be used to execute any SPARQL query + :param prefix: The URI prefixes required for an endpoint, default is the Wikidata specific prefixes + :param query: The actual SPARQL query string + :param endpoint: The URL string for the SPARQL endpoint. Default is the URL for the Wikidata SPARQL endpoint + :param user_agent: Set a user agent string for the HTTP header to let the Query Service know who you are. + :type user_agent: str + :param max_retries: The maximum number of times the request is attempted when it fails with a connection error, HTTP 503 or HTTP 429. + :param retry_after: The number of seconds to wait before retrying after an error code is received or the Query Service is not reachable. + :param debug: Enable debug output. 
+ :type debug: boolean + :return: The results of the query are returned in JSON format + """ + + sparql_endpoint_url = config['SPARQL_ENDPOINT_URL'] if endpoint is None else endpoint + user_agent = config['USER_AGENT_DEFAULT'] if user_agent is None else user_agent + + if prefix: + query = prefix + '\n' + query + + params = { + 'query': '#Tool: WikibaseIntegrator wbi_functions.execute_sparql_query\n' + query, + 'format': 'json' + } + + headers = { + 'Accept': 'application/sparql-results+json', + 'User-Agent': user_agent + } + + if debug: + print(params['query']) + + for n in range(max_retries): + try: + response = requests.post(sparql_endpoint_url, params=params, headers=headers) + except requests.exceptions.ConnectionError as e: + print("Connection error: {}. Sleeping for {} seconds.".format(e, retry_after)) + sleep(retry_after) + continue + if response.status_code == 503: + print("Service unavailable (503). Sleeping for {} seconds".format(retry_after)) + sleep(retry_after) + continue + if response.status_code == 429: + if 'retry-after' in response.headers.keys(): + retry_after = response.headers['retry-after'] + print("Too Many Requests (429). Sleeping for {} seconds".format(retry_after)) + sleep(retry_after) + continue + response.raise_for_status() + results = response.json() + + return results + + +def merge_items(from_id, to_id, ignore_conflicts='', mediawiki_api_url=None, login=None, allow_anonymous=False, user_agent=None): + """ + A function to merge two items + :param from_id: The QID which should be merged into another item + :type from_id: string with 'Q' prefix + :param to_id: The QID into which another item should be merged + :type to_id: string with 'Q' prefix + :param mediawiki_api_url: The MediaWiki url which should be used + :type mediawiki_api_url: str + :param ignore_conflicts: A string with the values 'description', 'statement' or 'sitelink', separated by a pipe ('|') if using more than one of those. 
+ :type ignore_conflicts: str + :param login: The object containing the login credentials and cookies. An instance of wbi_login.Login. + :param allow_anonymous: Allow anonymous edit to the MediaWiki API. Disabled by default. + :type allow_anonymous: bool + :param user_agent: Set a user agent string for the HTTP header to let the Query Service know who you are. + :type user_agent: str + """ + + params = { + 'action': 'wbmergeitems', + 'fromid': from_id, + 'toid': to_id, + 'format': 'json', + 'bot': '', + 'ignoreconflicts': ignore_conflicts + } + + if config['MAXLAG'] > 0: + params.update({'maxlag': config['MAXLAG']}) + + return mediawiki_api_call_helper(data=params, login=login, mediawiki_api_url=mediawiki_api_url, user_agent=user_agent, allow_anonymous=allow_anonymous) + + +def remove_claims(claim_id, summary=None, revision=None, mediawiki_api_url=None, login=None, allow_anonymous=False, user_agent=None): + """ + Remove one or more claims from an entity + :param claim_id: One GUID or several (pipe-separated) GUIDs identifying the claims to be removed. All claims must belong to the same entity. + :type claim_id: string + :param summary: Summary for the edit. Will be prepended by an automatically generated comment. + :type summary: str + :param revision: The numeric identifier for the revision to base the modification on. This is used for detecting conflicts during save. + :type revision: str + :param mediawiki_api_url: The MediaWiki url which should be used + :type mediawiki_api_url: str + :param login: The object containing the login credentials and cookies. An instance of wbi_login.Login. + :param allow_anonymous: Allow anonymous edit to the MediaWiki API. Disabled by default. + :type allow_anonymous: bool + :param user_agent: Set a user agent string for the HTTP header to let the Query Service know who you are. 
+ :type user_agent: str + """ + + params = { + 'action': 'wbremoveclaims', + 'claim': claim_id, + 'summary': summary, + 'baserevid': revision, + 'bot': True, + 'format': 'json' + } + + if config['MAXLAG'] > 0: + params.update({'maxlag': config['MAXLAG']}) + + return mediawiki_api_call_helper(data=params, login=login, mediawiki_api_url=mediawiki_api_url, user_agent=user_agent, allow_anonymous=allow_anonymous) + + +def search_entities(search_string, language=None, strict_language=True, search_type='item', mediawiki_api_url=None, max_results=500, dict_result=False, login=None, + allow_anonymous=True, user_agent=None): + """ + Performs a search for entities in the Wikibase instance using labels and aliases. + :param search_string: a string which should be searched for in the Wikibase instance (labels and aliases) + :type search_string: str + :param language: The language in which to perform the search. + :type language: str + :param strict_language: Whether to disable language fallback + :type strict_language: bool + :param search_type: Search for this type of entity. One of the following values: form, item, lexeme, property, sense + :type search_type: str + :param mediawiki_api_url: Specify the mediawiki_api_url. + :type mediawiki_api_url: str + :param max_results: The maximum number of search results returned. Default 500 + :type max_results: int + :param dict_result: If True, each result is a dict with the keys id, label, match, description and aliases; otherwise only the entity id is returned. + :type dict_result: boolean + :param login: The object containing the login credentials and cookies. An instance of wbi_login.Login. + :param allow_anonymous: Allow anonymous edit to the MediaWiki API. Enabled by default. 
+ :type allow_anonymous: bool + :param user_agent: The user agent string transmitted in the http header + :type user_agent: str + :return: list + """ + + language = config['DEFAULT_LANGUAGE'] if language is None else language + + params = { + 'action': 'wbsearchentities', + 'search': search_string, + 'language': language, + 'strict_language': strict_language, + 'type': search_type, + 'limit': 50, + 'format': 'json' + } + + cont_count = 0 + results = [] + + while True: + params.update({'continue': cont_count}) + + search_results = mediawiki_api_call_helper(data=params, login=login, mediawiki_api_url=mediawiki_api_url, user_agent=user_agent, + allow_anonymous=allow_anonymous) + + if search_results['success'] != 1: + raise SearchError('Wikibase API wbsearchentities failed') + else: + for i in search_results['search']: + if dict_result: + description = i['description'] if 'description' in i else None + aliases = i['aliases'] if 'aliases' in i else None + results.append({ + 'id': i['id'], + 'label': i['label'], + 'match': i['match'], + 'description': description, + 'aliases': aliases + }) + else: + results.append(i['id']) + + if 'search-continue' not in search_results: + break + else: + cont_count = search_results['search-continue'] + + if cont_count >= max_results: + break + + return results + + +def generate_item_instances(items, mediawiki_api_url=None, login=None, allow_anonymous=True, user_agent=None): + """ + A method which allows for retrieval of a list of Wikidata items or properties. The method generates a list of + tuples where the first value in the tuple is the QID or property ID, whereas the second is the new instance of + ItemEngine containing all the data of the item. This is most useful for mass retrieval of items. 
+ :param user_agent: A custom user agent + :type user_agent: str + :param items: A list of QIDs or property IDs + :type items: list + :param mediawiki_api_url: The MediaWiki url which should be used + :type mediawiki_api_url: str + :return: A list of tuples, first value in the tuple is the QID or property ID string, second value is the instance of ItemEngine with the corresponding + item data. + :param login: The object containing the login credentials and cookies. An instance of wbi_login.Login. + :param allow_anonymous: Allow anonymous edit to the MediaWiki API. Enabled by default. + :type allow_anonymous: bool + """ + + assert type(items) == list + + from wikibaseintegrator.wbi_core import ItemEngine + + params = { + 'action': 'wbgetentities', + 'ids': '|'.join(items), + 'format': 'json' + } + + reply = mediawiki_api_call_helper(data=params, login=login, mediawiki_api_url=mediawiki_api_url, user_agent=user_agent, allow_anonymous=allow_anonymous) + + item_instances = [] + for qid, v in reply['entities'].items(): + ii = ItemEngine(item_id=qid, item_data=v) + ii.mediawiki_api_url = mediawiki_api_url + item_instances.append((qid, ii)) + + return item_instances + + +def get_distinct_value_props(sparql_endpoint_url=None, wikibase_url=None, property_constraint_pid=None, distinct_values_constraint_qid=None): + """ + On wikidata, the default core IDs will be the properties with a distinct values constraint select ?p where {?p wdt:P2302 wd:Q21502410} + See: https://www.wikidata.org/wiki/Help:Property_constraints_portal + https://www.wikidata.org/wiki/Help:Property_constraints_portal/Unique_value + """ + + wikibase_url = config['WIKIBASE_URL'] if wikibase_url is None else wikibase_url + property_constraint_pid = config['PROPERTY_CONSTRAINT_PID'] if property_constraint_pid is None else property_constraint_pid + distinct_values_constraint_qid = config['DISTINCT_VALUES_CONSTRAINT_QID'] if distinct_values_constraint_qid is None else distinct_values_constraint_qid + + pcpid = 
property_constraint_pid + dvcqid = distinct_values_constraint_qid + + query = ''' + SELECT ?p WHERE {{ + ?p <{wb_url}/prop/direct/{prop_nr}> <{wb_url}/entity/{entity}> + }} + '''.format(wb_url=wikibase_url, prop_nr=pcpid, entity=dvcqid) + results = execute_sparql_query(query, endpoint=sparql_endpoint_url)['results']['bindings'] + if not results: + warn("Warning: No distinct value properties found\n" + + "Please set P2302 and Q21502410 in your Wikibase or set `core_props` manually.\n" + + "Continuing with no core_props") + return set() + else: + return set(map(lambda x: x['p']['value'].rsplit('/', 1)[-1], results))