death-notifier/src/main/php/com/fwdekker/deathnotifier/mediawiki/MediaWiki.php

265 lines
10 KiB
PHP

<?php
namespace com\fwdekker\deathnotifier\mediawiki;
use Monolog\Logger;
/**
* Helper class for interacting with Wikipedia's API.
*/
class MediaWiki
{
    /**
     * The URL of Wikipedia's API endpoint.
     */
    private const API_URL = "https://en.wikipedia.org/w/api.php?";

    /**
     * The user agent used to represent the death notifier to Wikipedia.
     */
    private const USER_AGENT =
        "death-notifier/%%VERSION_NUMBER%% " .
        "(https://git.fwdekker.com/tools/death-notifier; florine@fwdekker.com)";

    /**
     * Number of articles to query per query.
     */
    private const ARTICLES_PER_QUERY = 50;

    /**
     * Number of categories to return per article per query.
     *
     * Since the record for a single article is 252 categories, setting this to the maximum of 500 is more than
     * sufficient.
     */
    private const CATS_PER_QUERY = 500;


    /**
     * @var Logger the logger to use for logging
     */
    private Logger $logger; // @phpstan-ignore-line Unused, but useful for debugging


    /**
     * Creates a new Mediawiki instance.
     *
     * @param Logger $logger the logger to use for logging
     */
    public function __construct(Logger $logger)
    {
        $this->logger = $logger;
    }


    /**
     * Sends a request to Wikipedia's API and returns its response as a decoded JSON object.
     *
     * @param array<string, mixed> $params the request parameters to send to the API
     * @return mixed the API's response, decoded into an associative array
     * @throws MediaWikiException if the request fails or if the response is not a JSON object
     */
    private function api_fetch(array $params): mixed
    {
        $ch = curl_init();
        if ($ch === false)
            throw new MediaWikiException("Failed to initialize cURL.");

        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_URL, self::API_URL . http_build_query($params));
        curl_setopt($ch, CURLOPT_USERAGENT, self::USER_AGENT);

        $output = curl_exec($ch);
        // Read the error while the handle is still open, and only then release it
        $error = curl_error($ch);
        curl_close($ch);

        if (is_bool($output) || $error !== "")
            throw new MediaWikiException($error !== "" ? $error : "Request to Wikipedia's API failed.");

        // Anything that does not decode to an array cannot be processed by the callers, so fail explicitly here
        // instead of letting `null` cause a `TypeError` further down the line
        $decoded = json_decode($output, associative: true);
        if (!is_array($decoded))
            throw new MediaWikiException("Wikipedia's API did not return a JSON object.");

        return $decoded;
    }

    /**
     * Merges the response of a continued query into the responses accumulated so far.
     *
     * Arrays with string or non-sequential keys (such as the map from page ID to page) are merged key by key, with
     * scalar values from `$addition` overwriting those in `$base`. Lists (sequential integer keys starting at 0) are
     * instead appended to, skipping entries that are already present. Merging lists key by key would overwrite the
     * entries of `$base` with the entries of `$addition` that happen to have the same index.
     *
     * NOTE(review): This matters for a page's `categories`, which is assumed to be a list that restarts at index 0 in
     * each continued response; confirm against the API documentation.
     *
     * @param mixed[] $base the responses accumulated so far
     * @param mixed[] $addition the response to merge into `$base`
     * @return mixed[] the merged response
     */
    private static function merge_responses(array $base, array $addition): array
    {
        if ($base !== [] && $addition !== [] && array_is_list($base) && array_is_list($addition)) {
            foreach ($addition as $item)
                if (!in_array($item, $base, true))
                    $base[] = $item;

            return $base;
        }

        foreach ($addition as $key => $value) {
            if (is_array($value) && isset($base[$key]) && is_array($base[$key]))
                $base[$key] = self::merge_responses($base[$key], $value);
            else
                $base[$key] = $value;
        }

        return $base;
    }

    /**
     * Queries Wikipedia's API with continuation and returns the merged responses as a JSON object.
     *
     * @param array<string, mixed> $params the query parameters to send to the API
     * @param string|null $continue_name the name of the continue parameter to follow, or `null` if no continuation
     * should be done
     * @return mixed[] the merged responses of all requests, including the `query` key
     * @throws MediaWikiException if a request fails or if the API reports an error
     */
    private function api_query_continued(array $params, ?string $continue_name = null): array
    {
        $query_params = array_merge(["action" => "query", "format" => "json"], $params);

        $response = [];
        $continue = null;
        $continue_value = null;
        do {
            $continue_params = $continue === null
                ? $query_params
                : array_merge($query_params, ["continue" => $continue, $continue_name => $continue_value]);

            $new_response = $this->api_fetch($continue_params);
            if (isset($new_response["error"])) {
                $info = $new_response["error"]["info"] ?? "unknown error";
                throw new MediaWikiException("Wikipedia's API returned an error: " . strval($info));
            }

            $response = self::merge_responses($response, $new_response);

            // Decide based on the latest response only, and stop if the API does not say how to continue, because
            // otherwise the same request would be repeated forever
            $next = $new_response["continue"] ?? [];
            $can_continue =
                $continue_name !== null &&
                !isset($new_response["batchcomplete"]) &&
                is_array($next) &&
                isset($next["continue"], $next[$continue_name]);

            $continue = $can_continue ? $next["continue"] : null;
            $continue_value = $can_continue ? $next[$continue_name] : null;
        } while ($continue !== null);

        return $response;
    }

    /**
     * Sends a query request to the Wikipedia API in batches of `ARTICLES_PER_QUERY` titles at a time.
     *
     * @param array<string, string> $params the parameters to include in each query
     * @param string[] $titles the titles to query
     * @param string|null $continue_name the name of the continue parameter used for this request by the API
     * @return QueryOutput<mixed> the API's response
     * @throws MediaWikiException if the query fails
     * @noinspection PhpSameParameterValueInspection `$continue_name` may take other values in the future
     */
    private function api_query_batched(array $params, array $titles, ?string $continue_name): QueryOutput
    {
        $articles = [];
        // Every title initially maps to itself; normalizations and redirects found below overwrite these entries
        $redirects = array_combine($titles, $titles);
        $missing = [];

        foreach (array_chunk($titles, self::ARTICLES_PER_QUERY) as $title_chunk) {
            $chunk_params = array_merge($params, ["titles" => implode("|", $title_chunk), "redirects" => true]);
            $response = $this->api_query_continued($chunk_params, $continue_name)["query"] ?? [];

            foreach ($response["pages"] ?? [] as $article_id => $article) {
                // Pages with a negative ID are recorded as missing
                if ($article_id < 0)
                    $missing[] = strval($article["title"]);
                else
                    $articles[strval($article["title"])] = $article;
            }

            $response_normalized = array_column($response["normalized"] ?? [], "to", "from");
            foreach ($response_normalized as $from => $to)
                $redirects[strval($from)] = strval($to);

            $response_redirects = array_column($response["redirects"] ?? [], "to", "from");
            foreach ($response_redirects as $from => $to) {
                // Redirects are reported for the normalized title, so map back to the title as originally requested.
                // Numeric titles are turned into integer keys by PHP, so convert back before comparing strictly.
                $pre_normalized = array_search(strval($from), $response_normalized, true);
                $redirects[strval($pre_normalized === false ? $from : $pre_normalized)] = strval($to);
            }
        }

        return new QueryOutput($articles, $redirects, $missing);
    }


    /**
     * Returns the person's status, or `null` if the title does not refer to an article about a person on Wikipedia.
     *
     * @param mixed $article the article object as returned by the Wikipedia API
     * @return PersonStatus|null the person's status, or `null` if the title does not refer to an article about a person
     * on Wikipedia
     */
    private function person_status(mixed $article): ?PersonStatus
    {
        if (array_key_exists("missing", $article) || array_key_exists("invalid", $article))
            return null;

        // An article without any categories has no `categories` key, and is then simply not about a person
        $category_titles = array_column($article["categories"] ?? [], "title");

        $dead_regex = "/^Category:([0-9]{1,4}s? (BC |AD )?deaths|Year of death (missing|unknown))$/";
        foreach ($category_titles as $category_title)
            if (preg_match($dead_regex, strval($category_title)) === 1)
                return PersonStatus::Dead;

        // Order matters: the less certain statuses take precedence over plain `Alive`
        if (in_array("Category:Possibly living people", $category_titles, true))
            return PersonStatus::PossiblyAlive;
        if (in_array("Category:Missing people", $category_titles, true))
            return PersonStatus::Missing;
        if (in_array("Category:Living people", $category_titles, true))
            return PersonStatus::Alive;

        return null;
    }

    /**
     * Returns the type of the article.
     *
     * @param mixed $article the article object as returned by the Wikipedia API
     * @return ArticleType the type of article
     */
    private function article_type(mixed $article): ArticleType
    {
        if ($this->person_status($article) !== null)
            return ArticleType::Person;

        $category_titles = array_column($article["categories"] ?? [], "title");
        if (in_array("Category:All set index articles", $category_titles, true) ||
            in_array("Category:All disambiguation pages", $category_titles, true))
            return ArticleType::Disambiguation;

        return ArticleType::Other;
    }

    /**
     * Checks for each person what their status is according to Wikipedia's categorization.
     *
     * @param string[] $names the names of the people to check aliveness of
     * @return QueryOutput<array{"type": ArticleType, "status": PersonStatus|null}> a query output with results mapping
     * each article's normalized title to the article's type and, if the article is about a person, the person's status
     * @throws MediaWikiException if the query fails
     */
    public function query_person_info(array $names): QueryOutput
    {
        $output = $this->api_query_batched(
            params: ["prop" => "categories", "cllimit" => strval(self::CATS_PER_QUERY)],
            titles: $names,
            continue_name: "clcontinue"
        );

        $articles =
            array_combine(
                array_column($output->results, "title"),
                array_map(
                    fn($it) => [
                        "type" => $this->article_type($it),
                        "status" => $this->person_status($it),
                    ],
                    $output->results
                )
            );

        return new QueryOutput($articles, $output->redirects, $output->missing);
    }
}
/**
* Recursively merges arrays, while overwriting other types.
*
* Functions similar to `array_merge_recursive`, except that if two values are encountered at least one of which is not
* an array, the value of `array2` is taken, instead of taking an array of both values.
*
* If a key exists in `array1` but not in `array2`, then the value of `array1` is used. If a key exists in `array2` but
* not in `array1`, then the value of `array2` is used. If a key exists in both `array1` and `array2`, and both values
* are arrays, this function is applied recursively, effectively using a merged array containing the values of both
* arrays' arrays. If a key exists in both `array1` and `array2`, and at least one of the values is not an array, the
* value of `array2` is used.
*
* Taken from `https://www.php.net/manual/en/function.array-merge-recursive.php#92195`.
*
* @param mixed[] $array1 the base array to merge into
* @param mixed[] $array2 the array to merge into `array1`
* @return mixed[] the recursively merged array
* @author Daniel <daniel (at) danielsmedegaardbuus (dot) dk>
* @author Gabriel Sobrinho <gabriel (dot) sobrinho (at) gmail (dot) com>
*/
function array_merge_recursive_distinct(array $array1, array $array2): array
{
    // `$array1` is passed by value, so it doubles as the accumulator without affecting the caller's array
    foreach ($array2 as $key => $incoming) {
        // A key that is absent (or `null`) in `$array1` can never be an array, so `$incoming` simply wins then
        $existing = $array1[$key] ?? null;

        // Recurse only if both sides are arrays; in all other cases the value of `$array2` takes precedence
        $array1[$key] = is_array($incoming) && is_array($existing)
            ? array_merge_recursive_distinct($existing, $incoming)
            : $incoming;
    }

    return $array1;
}