222 lines
7.5 KiB
PHP
222 lines
7.5 KiB
PHP
<?php
|
|
|
|
namespace php;
|
|
|
|
use Exception;
|
|
use Monolog\Logger;
|
|
|
|
|
|
/**
 * Helper class for interacting with Wikipedia's API.
 *
 * Titles are looked up through the `action=query` endpoint in fixed-size batches. Each lookup yields a
 * `QueryOutput` that holds the found pages, a mapping from each queried title to its normalized/redirected title,
 * and the list of titles that Wikipedia reports as missing.
 */
class Mediawiki
{
    /**
     * The URL of Wikipedia's API endpoint, including the trailing `?` so that a query string can be appended.
     */
    private const API_URL = "https://en.wikipedia.org/w/api.php?";
    /**
     * The user agent used to represent the death notifier to Wikipedia.
     *
     * NOTE(review): `%%VERSION_NUMBER%%` is presumably substituted by the build process; confirm.
     */
    private const USER_AGENT =
        "death-notifier/%%VERSION_NUMBER%% " .
        "(https://git.fwdekker.com/tools/death-notifier; florine@fwdekker.com)";
    /**
     * Value sent as `cllimit` when requesting categories.
     *
     * Since the record for a single page is 252 categories, setting to the maximum of 500 is more than sufficient.
     *
     * NOTE(review): MediaWiki appears to apply `cllimit` to the request as a whole rather than to each page, so a
     * batch of many category-heavy pages may be truncated unless `clcontinue` is followed; confirm against the API
     * documentation.
     */
    private const CATS_PER_QUERY = 500;
    /**
     * Number of titles that are sent to the API in a single request.
     */
    private const TITLES_PER_QUERY = 50;


    /**
     * Creates a new Mediawiki instance.
     *
     * @param Logger $logger the logger to use for logging
     */
    public function __construct(private readonly Logger $logger)
    {
    }


    /**
     * Sends a request to Wikipedia's API and returns its response as a decoded JSON value.
     *
     * @param array<string, mixed> $url_param the query parameters to send to the API
     * @return mixed the decoded JSON response, with objects represented as associative arrays
     * @throws Exception if the API could not be reached, or if its response is not valid JSON
     */
    private function api_fetch(array $url_param): mixed
    {
        $ch = curl_init();
        if ($ch === false)
            throw new Exception("Failed to initialise cURL.");

        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_URL, self::API_URL . http_build_query($url_param));
        curl_setopt($ch, CURLOPT_USERAGENT, self::USER_AGENT);

        $output = curl_exec($ch);
        // Read the error before releasing the handle, so the message never depends on a closed handle
        $error = curl_error($ch);
        curl_close($ch);

        if (!is_string($output) || $error !== "")
            throw new Exception($error);

        // `JsonException` extends `Exception`, so callers that catch `Exception` also see malformed responses
        return json_decode($output, associative: true, flags: JSON_THROW_ON_ERROR);
    }

    /**
     * Sends a query request to the Wikipedia API in batches of `TITLES_PER_QUERY` titles at a time.
     *
     * If a request fails, the failure is logged, the HTTP response code is set to 500, and the script exits. If a
     * response arrives but contains no `query` element, a warning is logged and that batch contributes nothing.
     *
     * @param array<string, string> $params the parameters to include in each query
     * @param string[] $titles the titles to query
     * @return QueryOutput<mixed> the API's response
     */
    private function api_query(array $params, array $titles): QueryOutput
    {
        $pages = [];
        // Every title initially maps to itself; normalizations and redirects overwrite entries below
        $redirects = array_combine($titles, $titles);
        $missing = [];

        foreach (array_chunk($titles, self::TITLES_PER_QUERY) as $batch) {
            // Caller-supplied parameters come last, so they take precedence over the defaults
            $iteration_params = array_merge(
                [
                    "action" => "query",
                    "format" => "json",
                    "redirects" => true,
                    "titles" => implode("|", $batch),
                ],
                $params,
            );

            try {
                $body = $this->api_fetch($iteration_params);
            } catch (Exception $exception) {
                $this->logger->error(
                    "Failed to query Wikipedia API.",
                    ["cause" => $exception, "params" => $iteration_params]
                );

                http_response_code(500);
                exit();
            }

            $response = is_array($body) && is_array($body["query"] ?? null) ? $body["query"] : null;
            if ($response === null) {
                $this->logger->warning(
                    "Wikipedia API response contains no query result.",
                    ["params" => $iteration_params]
                );
                continue;
            }

            // Pages with a negative ID are treated as missing; all other pages are stored under their title
            foreach (($response["pages"] ?? []) as $page_id => $page) {
                $title = strval($page["title"]);

                if ($page_id < 0)
                    $missing[] = $title;
                else
                    $pages[$title] = $page;
            }

            $response_normalized = array_column($response["normalized"] ?? [], "to", "from");
            foreach ($response_normalized as $from => $to)
                $redirects[strval($from)] = strval($to);

            $response_redirects = array_column($response["redirects"] ?? [], "to", "from");
            foreach ($response_redirects as $from => $to) {
                // A redirect may start at a normalized title; if so, map the originally queried title instead.
                // Compare as strings: a loose comparison would consider titles such as "1e3" and "1000" equal.
                $pre_normalized = array_search(strval($from), $response_normalized, true);
                $redirects[strval($pre_normalized === false ? $from : $pre_normalized)] = strval($to);
            }
        }

        return new QueryOutput($pages, $redirects, $missing);
    }

    /**
     * Determines for each title whether the page at Wikipedia exists.
     *
     * @param string[] $titles the titles of the pages to check
     * @return QueryOutput<string> a query output where the result maps the title of each non-missing page to an
     * empty string
     */
    public function pages_exist(array $titles): QueryOutput
    {
        $output = $this->api_query(["prop" => "info"], $titles);
        return new QueryOutput(array_fill_keys(array_keys($output->results), ""), $output->redirects, $output->missing);
    }

    /**
     * Returns the person's status, or `null` if the title does not refer to a page about a person on Wikipedia.
     *
     * The status is derived from the page's categories. A page without any listed categories yields `null`.
     *
     * @param mixed $person_page the page as returned by the Wikipedia API
     * @return PersonStatus|null the person's status, or `null` if the title does not refer to a page about a person on
     * Wikipedia
     */
    private function person_status(mixed $person_page): ?PersonStatus
    {
        if (!is_array($person_page))
            return null;
        if (array_key_exists("missing", $person_page) || array_key_exists("invalid", $person_page))
            return null;

        // The API omits `categories` when it returns none for a page, so fall back to an empty list
        $category_titles = array_column($person_page["categories"] ?? [], "title");
        $deceased_regex = "/^Category:([0-9]{1,4}s? (BC |AD )?deaths|Year of death (missing|unknown))$/";

        $is_deceased = false;
        foreach ($category_titles as $category_title) {
            if (is_string($category_title) && preg_match($deceased_regex, $category_title) === 1) {
                $is_deceased = true;
                break;
            }
        }

        // Arms are evaluated in order, so a death category wins over any of the "living" categories
        return match (true) {
            $is_deceased => PersonStatus::Deceased,
            in_array("Category:Possibly living people", $category_titles, true) => PersonStatus::PossiblyAlive,
            in_array("Category:Missing people", $category_titles, true) => PersonStatus::Missing,
            in_array("Category:Living people", $category_titles, true) => PersonStatus::Alive,
            default => null,
        };
    }

    /**
     * Checks for each person what their status is according to Wikipedia's categorization.
     *
     * @param array<string> $people_names the names of the people to check aliveness of
     * @return QueryOutput<PersonStatus|null> a query output with a response indicating for each person the status
     */
    public function people_statuses(array $people_names): QueryOutput
    {
        $output = $this->api_query(["prop" => "categories", "cllimit" => strval(self::CATS_PER_QUERY)], $people_names);

        // The results are already keyed by page title, and `array_map` preserves those keys
        $pages = array_map($this->person_status(...), $output->results);

        return new QueryOutput($pages, $output->redirects, $output->missing);
    }
}
|
|
|
|
|
|
/**
 * Immutable container for the outcome of a query request sent to Wikipedia's API.
 *
 * @template T the type of each entry in the results
 */
class QueryOutput
{
    /**
     * Builds a query output from its three parts; each part is stored as-is and cannot be reassigned afterwards.
     *
     * @param array<string, T> $results the query's results, either exactly as the API returned them or after
     * processing
     * @param array<string, string> $redirects maps each queried name to its normalized/redirected name
     * @param string[] $missing the pages that are missing
     */
    public function __construct(
        /** @var array<string, T> the query's results, either exactly as the API returned them or after processing */
        public readonly array $results,
        /** @var array<string, string> maps each queried name to its normalized/redirected name */
        public readonly array $redirects,
        /** @var string[] the pages that are missing */
        public readonly array $missing,
    ) {
    }
}
|