death-notifier/src/main/php/Mediawiki.php

222 lines
7.5 KiB
PHP

<?php
namespace php;
use Exception;
use Monolog\Logger;
/**
 * Helper class for interacting with Wikipedia's API.
 *
 * Titles are queried in batches; the per-title results, the mapping from queried titles to their
 * normalized/redirected titles, and the list of missing titles are returned as a `QueryOutput`.
 */
class Mediawiki
{
    /**
     * The URL of Wikipedia's API endpoint, including the trailing `?` so a query string can be appended directly.
     */
    private const API_URL = "https://en.wikipedia.org/w/api.php?";
    /**
     * The user agent used to represent the death notifier to Wikipedia.
     */
    private const USER_AGENT =
        "death-notifier/%%VERSION_NUMBER%% " .
        "(https://git.fwdekker.com/tools/death-notifier; florine@fwdekker.com)";
    /**
     * Value sent as `cllimit` when requesting the categories of pages.
     *
     * Since the record for a single page is 252 categories, setting to the maximum of 500 is more than sufficient
     * for any single page.
     *
     * NOTE(review): the MediaWiki API documentation appears to describe `cllimit` as a limit on the total number of
     * categories in one response rather than a per-page limit. If so, a batch of many titles can be truncated unless
     * the `continue` element of the response is followed, which this class does not do. Confirm against the API
     * documentation.
     */
    private const CATS_PER_QUERY = 500;
    /**
     * Maximum number of titles that are sent to the API in a single request.
     */
    private const TITLES_PER_QUERY = 50;

    /**
     * @var Logger The logger to use for logging.
     */
    private Logger $logger;

    /**
     * Creates a new Mediawiki instance.
     *
     * @param Logger $logger the logger to use for logging
     */
    public function __construct(Logger $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Sends a request to Wikipedia's API and returns its response as a decoded JSON value.
     *
     * @param array<string, mixed> $url_param the query parameters to send to the API
     * @return mixed the decoded JSON response, with JSON objects represented as associative arrays
     * @throws Exception if the API could not be reached, or if the response is not valid JSON
     */
    private function api_fetch(array $url_param): mixed
    {
        $ch = curl_init();
        if ($ch === false)
            throw new Exception("Failed to initialise cURL.");

        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_URL, self::API_URL . http_build_query($url_param));
        curl_setopt($ch, CURLOPT_USERAGENT, self::USER_AGENT);

        $output = curl_exec($ch);
        // The error must be read while the handle is still open, so do this before `curl_close`.
        $error = curl_error($ch);
        curl_close($ch);

        if (!is_string($output) || $error !== "")
            throw new Exception($error !== "" ? $error : "cURL request failed without an error message.");

        // `JsonException` extends `Exception`, so callers catching `Exception` also see malformed responses instead
        // of silently receiving `null`.
        return json_decode($output, associative: true, flags: JSON_THROW_ON_ERROR);
    }

    /**
     * Sends a query request to the Wikipedia API in batches of `TITLES_PER_QUERY` titles at a time.
     *
     * If a request fails, or the API answers with an error object, the failure is logged, the HTTP response code is
     * set to 500, and the script exits.
     *
     * @param array<string, string> $params the parameters to include in each query
     * @param string[] $titles the titles to query
     * @return QueryOutput<mixed> the API's response
     */
    private function api_query(array $params, array $titles): QueryOutput
    {
        $pages = [];
        // Every title initially maps to itself; normalizations and redirects overwrite entries below.
        $redirects = array_combine($titles, $titles);
        $missing = [];

        foreach (array_chunk($titles, self::TITLES_PER_QUERY) as $batch) {
            $iteration_params = array_merge(
                [
                    "action" => "query",
                    "format" => "json",
                    "redirects" => true,
                    "titles" => implode("|", $batch)
                ],
                $params
            );

            try {
                $body = $this->api_fetch($iteration_params);
                if (!is_array($body))
                    throw new Exception("Wikipedia API returned an unexpected response.");
                if (array_key_exists("error", $body))
                    throw new Exception("Wikipedia API returned an error: " . json_encode($body["error"]));
            } catch (Exception $exception) {
                $this->logger->error(
                    "Failed to query Wikipedia API.",
                    ["cause" => $exception, "params" => $iteration_params]
                );
                http_response_code(500);
                exit();
            }

            // A response without a `query` element is treated as a response without any results.
            $response = is_array($body["query"] ?? null) ? $body["query"] : [];

            // Entries with a negative page ID are recorded as missing; all others are kept as results.
            foreach ($response["pages"] ?? [] as $page_id => $page) {
                if ($page_id < 0)
                    $missing[] = strval($page["title"]);
                else
                    $pages[strval($page["title"])] = $page;
            }

            $response_normalized = array_column($response["normalized"] ?? [], "to", "from");
            foreach ($response_normalized as $from => $to)
                $redirects[strval($from)] = strval($to);

            $response_redirects = array_column($response["redirects"] ?? [], "to", "from");
            foreach ($response_redirects as $from => $to) {
                // A redirect may start at a normalized title; in that case, map the originally queried title instead.
                // Array keys that look like integers are cast to `int` by PHP, so convert back to a string before
                // the strict search, which is needed to tell apart titles such as "1000" and "1e3".
                $pre_normalized = array_search(strval($from), $response_normalized, true);
                $redirects[strval($pre_normalized === false ? $from : $pre_normalized)] = strval($to);
            }
        }

        return new QueryOutput($pages, $redirects, $missing);
    }

    /**
     * Determines for each title whether the page at Wikipedia exists.
     *
     * @param string[] $titles the titles of the pages to check
     * @return QueryOutput<string> a query output where the result maps each non-missing page title to an empty string
     */
    public function pages_exist(array $titles): QueryOutput
    {
        $output = $this->api_query(["prop" => "info"], $titles);

        return new QueryOutput(array_fill_keys(array_keys($output->results), ""), $output->redirects, $output->missing);
    }

    /**
     * Returns the person's status, or `null` if the title does not refer to a page about a person on Wikipedia.
     *
     * The categories are checked in order of precedence: deceased, possibly living, missing, living.
     *
     * @param mixed $person_page the page as returned by the Wikipedia API
     * @return PersonStatus|null the person's status, or `null` if the title does not refer to a page about a person on
     * Wikipedia
     */
    private function person_status(mixed $person_page): ?PersonStatus
    {
        if (array_key_exists("missing", $person_page) || array_key_exists("invalid", $person_page))
            return null;

        // The `categories` key may be absent from a page; treat that the same as an empty list of categories.
        $category_titles = array_column($person_page["categories"] ?? [], "title");

        $deceased_regex = "/^Category:([0-9]{1,4}s? (BC |AD )?deaths|Year of death (missing|unknown))$/";
        foreach ($category_titles as $category_title) {
            if (preg_match($deceased_regex, strval($category_title)) === 1)
                return PersonStatus::Deceased;
        }

        if (in_array("Category:Possibly living people", $category_titles, true))
            return PersonStatus::PossiblyAlive;
        if (in_array("Category:Missing people", $category_titles, true))
            return PersonStatus::Missing;
        if (in_array("Category:Living people", $category_titles, true))
            return PersonStatus::Alive;

        return null;
    }

    /**
     * Checks for each person what their status is according to Wikipedia's categorization.
     *
     * @param array<string> $people_names the names of the people to check aliveness of
     * @return QueryOutput<PersonStatus|null> a query output with a response indicating for each person the status
     */
    public function people_statuses(array $people_names): QueryOutput
    {
        $output = $this->api_query(["prop" => "categories", "cllimit" => strval(self::CATS_PER_QUERY)], $people_names);

        $pages =
            array_combine(
                array_column($output->results, "title"),
                array_map(fn($it) => $this->person_status($it), $output->results)
            );

        return new QueryOutput($pages, $output->redirects, $output->missing);
    }
}
/**
 * Immutable container for the outcome of a query request sent to Wikipedia's API.
 *
 * All three properties are assigned once by the constructor and cannot be changed afterwards.
 *
 * @template T the result type that is returned
 */
class QueryOutput
{
    /**
     * Constructs a new query output.
     *
     * @param array<string, T> $results the results of the query, either raw from the API or processed in some way
     * @param array<string, string> $redirects mapping of queried names to normalized/redirected names
     * @param string[] $missing list of missing pages
     */
    public function __construct(
        public readonly array $results,
        public readonly array $redirects,
        public readonly array $missing
    ) {
    }
}