<?php
|
|
|
|
namespace com\fwdekker\deathnotifier\wikipedia;
|
|
|
|
use com\fwdekker\deathnotifier\Config;
|
|
use com\fwdekker\deathnotifier\Util;
|
|
use JsonException;
|
|
|
|
|
|
/**
|
|
* Helper class for interacting with Wikipedia's API.
|
|
*/
|
|
class Wikipedia
{
    /**
     * The URL of Wikipedia's API endpoint.
     */
    private const API_URL = "https://en.wikipedia.org/w/api.php?";

    /**
     * The user agent used to represent the death notifier to Wikipedia.
     *
     * `%%VERSION_NUMBER%%` is presumably substituted at build time (TODO confirm against the build script);
     * `%1$s` is the contact address from the configuration and `%2$s` is the curl version string.
     */
    private const USER_AGENT_FORMAT = "death-notifier/%%VERSION_NUMBER%% (%1\$s) %2\$s";

    /**
     * Number of articles to query per query.
     */
    private const ARTICLES_PER_QUERY = 50;

    /**
     * Number of categories to return per article per query.
     *
     * Since the record for a single article is 252 categories, setting this to the maximum of 500 is more than
     * sufficient.
     */
    private const CATS_PER_QUERY = 500;

    /**
     * Number of article moves to follow of deleted articles before giving up.
     */
    private const MAX_MOVE_DEPTH = 5;


    /**
     * Sends a request to Wikipedia's API and returns its response as a JSON object.
     *
     * @param array<string, mixed> $params the request parameters to send to the API
     * @return mixed a JSON object containing the API's response
     * @throws WikipediaException if the request fails or the response is not valid JSON
     */
    private function api_fetch(array $params): mixed
    {
        $config = Config::get("wikipedia");
        $user_agent =
            sprintf(
                self::USER_AGENT_FORMAT,
                $config["user_agent_contact"],
                "curl/" . curl_version()["version"],
            );

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_URL, self::API_URL . http_build_query($params));
        curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);

        $output = curl_exec($ch);
        // BUGFIX: the error message must be read *before* `curl_close`, which resets the handle's error buffer.
        // The previous order (close, then `curl_error`) meant errors were never detected and the exception message
        // was always empty.
        $error = curl_error($ch);
        curl_close($ch);
        if (is_bool($output) || $error !== "")
            throw new WikipediaException($error);

        try {
            return json_decode($output, associative: true, flags: JSON_THROW_ON_ERROR);
        } catch (JsonException $exception) {
            throw new WikipediaException($exception->getMessage(), previous: $exception);
        }
    }

    /**
     * Sends a query request to Wikipedia's API with continuation and returns its response as a JSON object.
     *
     * Responses are merged recursively; the loop ends once the API signals `batchcomplete`, or immediately after the
     * first response if {@see $continue_name} is `null`.
     *
     * @param array<string, mixed> $params the query request parameters to send to the API
     * @param string|null $continue_name the name of the continue parameter to follow, or `null` if no continuation
     * should be done
     * @return mixed[] a JSON array containing the API's responses merged into one array
     * @throws WikipediaException if the query fails
     */
    private function api_query_continued(array $params, ?string $continue_name = null): array
    {
        $query_params = array_merge(["action" => "query", "format" => "json"], $params);

        $response = [];
        $continue = null;
        $continue_value = null;
        do {
            $continue_params = $continue === null
                ? $query_params
                : array_merge($query_params, ["continue" => $continue, $continue_name => $continue_value]);

            $new_response = $this->api_fetch($continue_params);
            $response = Util::array_merge_recursive_distinct($response, $new_response);

            if (isset($response["batchcomplete"])) {
                $continue = null;
                $continue_value = null;
            } else if ($continue_name !== null) {
                $continue = $response["continue"]["continue"];
                $continue_value = $response["continue"][$continue_name];
            }
        } while ($continue !== null);

        return $response;
    }

    /**
     * Sends a query request to the Wikipedia API in batches of {@see Wikipedia::ARTICLES_PER_QUERY} titles at a time.
     *
     * @param array<string, mixed> $params the parameters to include in each query
     * @param string[] $titles the titles of the pages to query
     * @param string|null $continue_name the name of the continue parameter to follow for this request
     * @return QueryOutput<mixed> the API's responses merged into a single `QueryOutput`
     * @throws WikipediaException if the query fails
     */
    private function api_query_batched(array $params, array $titles, ?string $continue_name = null): QueryOutput
    {
        $articles = [];
        $redirects = new Redirects();
        $missing = [];

        $title_chunks = array_chunk($titles, self::ARTICLES_PER_QUERY);
        foreach ($title_chunks as $title_chunk) {
            $chunk_params = array_merge($params, ["titles" => implode("|", $title_chunk), "redirects" => true]);
            $response = $this->api_query_continued($chunk_params, $continue_name)["query"];

            // The API marks missing pages with a negative page id.
            foreach ($response["pages"] as $article_id => $article) {
                if ($article_id < 0)
                    $missing[] = strval($article["title"]);
                else
                    $articles[strval($article["title"])] = $article;
            }

            // Track both title normalizations and redirects so callers can map queried titles back to results.
            $new_normalizations =
                array_combine(
                    array_map(fn($it) => strval($it), array_column($response["normalized"] ?? [], "from")),
                    array_map(fn($it) => strval($it), array_column($response["normalized"] ?? [], "to"))
                );
            $redirects = $redirects->add($new_normalizations);

            $new_redirects =
                array_combine(
                    array_map(fn($it) => strval($it), array_column($response["redirects"] ?? [], "from")),
                    array_map(fn($it) => strval($it), array_column($response["redirects"] ?? [], "to"))
                );
            $redirects = $redirects->add($new_redirects);
        }

        return new QueryOutput($articles, $redirects, $missing);
    }

    /**
     * Figures out the page that a deleted page was moved to, if any.
     *
     * @param string $title the title to figure out the new title of after moving
     * @param int $max_depth the maximum number of recursive steps to take, in case the article that {@see $title} was
     * moved to was also deleted, and the article that was moved to was also deleted, etc.
     * @return string|null the new title of the article, or `null` if the article has actually been deleted
     * @throws WikipediaException if the query fails
     */
    private function api_query_title_after_move(string $title, int $max_depth = Wikipedia::MAX_MOVE_DEPTH): ?string
    {
        if ($max_depth <= 0)
            return null;

        $log_events = $this->api_fetch([
            "action" => "query",
            "format" => "json",
            "list" => "logevents",
            "letype" => "move",
            "letitle" => $title
        ])["query"]["logevents"];
        // No move log entries: the article was deleted outright, not moved.
        if (empty($log_events))
            return null;

        // Log events are returned newest-first, so index 0 is the most recent move.
        $title_after_move = $log_events[0]["params"]["target_title"];
        $after_move_page_info = $this->api_fetch([
            "action" => "query",
            "format" => "json",
            "prop" => "info",
            "titles" => $title_after_move,
            "redirects" => true,
        ])["query"];

        // A page id of -1 means the move target is itself missing; recurse with a reduced depth budget.
        if (array_key_exists(-1, $after_move_page_info["pages"]))
            return $this->api_query_title_after_move($title_after_move, $max_depth - 1);
        else if (!empty($after_move_page_info["redirects"]))
            return $after_move_page_info["redirects"][0]["to"];
        else
            return $title_after_move;
    }

    /**
     * Sends a query request to the Wikipedia API in batches of {@see Wikipedia::ARTICLES_PER_QUERY} titles at a time,
     * and optionally resolves articles that were deleted because of a move.
     *
     * @param array<string, mixed> $params the parameters to include in each query
     * @param string[] $titles the titles of the pages to query
     * @param string|null $continue_name the name of the continue parameter to follow for this request
     * @param bool $resolve_moves `true` if and only if deleted pages that were moved should be resolved
     * @return QueryOutput<mixed> the API's responses merged into a single `QueryOutput`
     * @throws WikipediaException if the query fails, or if a move target unexpectedly redirects or is missing
     */
    private function api_query_batched_resolve_moves(array $params, array $titles, ?string $continue_name = null,
                                                     bool $resolve_moves = true): QueryOutput
    {
        $output_base = $this->api_query_batched($params, $titles, $continue_name);
        if (!$resolve_moves) return $output_base;

        // Partition the missing titles into genuinely-deleted ones and ones that were merely moved.
        $not_moved = [];
        $moves = [];
        foreach ($output_base->missing as $missing_title) {
            $title_after_move = $this->api_query_title_after_move($missing_title);
            if ($title_after_move === null)
                $not_moved[] = $missing_title;
            else
                $moves[$missing_title] = $title_after_move;
        }

        // Move targets were already resolved through redirects above, so any further redirect or missing page here
        // indicates an inconsistency worth failing loudly on.
        $output_of_moves = $this->api_query_batched($params, array_values($moves), $continue_name);
        if ($output_of_moves->redirects->has_non_reflexive_redirects())
            throw new WikipediaException("Article was moved unexpectedly: " . json_encode($output_of_moves->redirects));
        if (!empty($output_of_moves->missing))
            throw new WikipediaException("Article missing unexpectedly: " . json_encode($output_of_moves->missing));

        return new QueryOutput(
            array_replace($output_base->results, $output_of_moves->results),
            $output_base->redirects->add($moves),
            $not_moved
        );
    }


    /**
     * Returns the current {@see ArticleType} of the article.
     *
     * @param mixed $article the article object as returned by the Wikipedia API
     * @return ArticleType the current `ArticleType` of article
     */
    private function article_type(mixed $article): ArticleType
    {
        // The API omits the `categories` key entirely for pages without categories, so default to an empty list.
        $category_titles = array_column($article["categories"] ?? [], "title");

        $status = $this->person_status($article);
        if ($status !== null)
            return ArticleType::Person;
        else if (in_array("Category:All set index articles", $category_titles, strict: true) ||
            in_array("Category:All disambiguation pages", $category_titles, strict: true))
            return ArticleType::Disambiguation;
        else
            return ArticleType::Other;
    }

    /**
     * Returns the current {@see PersonStatus}, or `null` if the title does not refer to an article about a person on
     * Wikipedia.
     *
     * @param mixed $article the article as returned by the Wikipedia API
     * @return PersonStatus|null the current `PersonStatus`, or `null` if the title does not refer to an article about a
     * person on Wikipedia
     */
    private function person_status(mixed $article): ?PersonStatus
    {
        if (array_key_exists("missing", $article) || array_key_exists("invalid", $article))
            return null;

        // The API omits the `categories` key entirely for pages without categories, so default to an empty list.
        $category_titles = array_column($article["categories"] ?? [], "title");
        // Matches e.g. "Category:2020 deaths", "Category:44 BC deaths", "Category:Year of death unknown".
        $dead_regex = "/^Category:([0-9]{1,4}s? (BC |AD )?(deaths|suicides)|Year of death (missing|unknown))$/";

        // Check death categories first: a dead person's article may still carry stale "living people" categories.
        if (!empty(array_filter($category_titles, fn($it) => preg_match($dead_regex, $it))))
            return PersonStatus::Dead;
        elseif (in_array("Category:Possibly living people", $category_titles, strict: true))
            return PersonStatus::PossiblyAlive;
        elseif (in_array("Category:Missing people", $category_titles, strict: true))
            return PersonStatus::Missing;
        elseif (in_array("Category:Living people", $category_titles, strict: true))
            return PersonStatus::Alive;
        else
            return null;
    }


    /**
     * Checks for all {@see $names} what their current {@see ArticleType} and {@see PersonStatus} is according to
     * Wikipedia.
     *
     * @param string[] $names the names of the people to retrieve the information of
     * @param bool $resolve_moves `true` if and only if deleted pages that were moved should be resolved
     * @return QueryOutput<array{"type": ArticleType, "status": PersonStatus|null}> a `QueryOutput` with its
     * {@see QueryOutput::$results} mapping each normalized title to its `ArticleType` and, if the type is `Person`, the
     * `PersonStatus`
     * @throws WikipediaException if the query fails
     */
    public function query_people_info(array $names, bool $resolve_moves): QueryOutput
    {
        $output = $this->api_query_batched_resolve_moves(
            params: ["prop" => "categories", "cllimit" => strval(self::CATS_PER_QUERY)],
            titles: $names,
            continue_name: "clcontinue",
            resolve_moves: $resolve_moves
        );

        $articles =
            array_combine(
                array_column($output->results, "title"),
                array_map(
                    fn($it) => [
                        "type" => $this->article_type($it),
                        "status" => $this->person_status($it),
                    ],
                    $output->results
                )
            );

        return new QueryOutput($articles, $output->redirects, $output->missing);
    }
}
|