// interlanguage-checker/src/main/js/MediaWiki.ts
//
// 797 lines
// 25 KiB
// TypeScript

import {couldNotConnectMessage, mergeStates} from "./Shared";
/**
 * Identifies a page by the language of its wiki combined with the page's title.
 *
 * An `InterlangLink` is purely an _identifier_: it carries no information about the page it points to. Details such
 * as the links a page contains or whether the page exists are modelled by the `Page` class.
 */
export class InterlangLink {
    /**
     * Constructs a new interlanguage link.
     *
     * @param lang the language of the wiki the page belongs to
     * @param title the title of the page
     */
    constructor(readonly lang: string, readonly title: string) {
    }

    /**
     * Checks whether the given object is an `InterlangLink` with exactly the same language and title.
     *
     * @param other the object to compare to this `InterlangLink`
     * @return `true` if and only if the given object equals this `InterlangLink`
     */
    equals(other: any): boolean {
        if (!(other instanceof InterlangLink))
            return false;

        return other.lang === this.lang && other.title === this.title;
    }

    /**
     * Checks whether the given object is an `InterlangLink` with the same language and with a title that differs at
     * most in letter case. The language is still compared case-sensitively.
     *
     * @param other the object to compare to this `InterlangLink`
     * @return `true` if and only if the given object equals this `InterlangLink`, ignoring the case of the titles
     */
    equalsIgnoringCase(other: any) {
        if (!(other instanceof InterlangLink))
            return false;

        const sameLang = other.lang === this.lang;
        return sameLang && other.title.toLowerCase() === this.title.toLowerCase();
    }

    /**
     * Formats this link as `lang:title`.
     *
     * @return the string representation of this `InterlangLink`
     */
    toString(): string {
        return [this.lang, this.title].join(":");
    }

    /**
     * Creates a new, independent `InterlangLink` with the same language and title.
     *
     * @return the deep copy
     */
    copy(): InterlangLink {
        const {lang, title} = this;
        return new InterlangLink(lang, title);
    }
}
/**
 * Records that one `InterlangLink` redirects to another.
 */
export class Redirect {
    /**
     * The page that redirects.
     */
    readonly from: InterlangLink;
    /**
     * The page that is redirected to.
     */
    readonly to: InterlangLink;

    /**
     * Constructs a new `Redirect`.
     *
     * Both links are copied, so the redirect does not share objects with the caller.
     *
     * @param from the page that redirects
     * @param to the page that is redirected to
     */
    constructor(from: InterlangLink, to: InterlangLink) {
        const source = from.copy();
        const target = to.copy();

        this.from = source;
        this.to = target;
    }

    /**
     * Checks whether the given object is a `Redirect` with an equal source and an equal target.
     *
     * @param other the object to compare to this `Redirect`
     * @return `true` if and only if the given object equals this `Redirect`
     */
    equals(other: any): boolean {
        if (!(other instanceof Redirect))
            return false;
        if (!this.from.equals(other.from))
            return false;

        return this.to.equals(other.to);
    }

    /**
     * Returns a deep copy of this `Redirect`.
     *
     * The copy is deep because the constructor copies both links it receives.
     *
     * @return the deep copy
     */
    copy(): Redirect {
        const {from, to} = this;
        return new Redirect(from, to);
    }
}
/**
 * A map of interwiki links, i.e. a mapping from interwiki prefixes to URLs.
 *
 * The entries are stored in a `Map`. When the constructor receives several entries with the same prefix, the entry
 * that comes last overwrites the earlier ones.
 */
// TODO: Replace entire class with a `Map`
export class InterwikiMap {
    /**
     * The mapping from interwiki abbreviations/prefixes to URLs.
     */
    readonly map: Map<string, string>;

    /**
     * Constructs a new interwiki map.
     *
     * URLs that use the `http` scheme are upgraded to `https`; all other URLs are stored unchanged.
     *
     * @param map the mapping from interwiki abbreviations/prefixes to URLs
     */
    constructor(map: { prefix: string, url: string }[]) {
        this.map = new Map();
        for (const {prefix, url} of map) {
            // Anchor at the start so that only the URL's own scheme is upgraded, and not an `http://` that happens to
            // be embedded further on in the URL (e.g. in the target of an archive or proxy link).
            this.map.set(prefix, url.replace(/^http:\/\//i, "https://"));
        }
    }

    /**
     * Constructs a new interwiki map from the given map.
     *
     * @param map the map to construct an interwiki map from
     * @return an interwiki map with the same entries as the given map
     */
    static fromMap(map: Map<string, string>): InterwikiMap {
        return new InterwikiMap([...map.entries()].map(([prefix, url]) => ({prefix, url})));
    }

    /**
     * Returns the URL for the given prefix, or `undefined` if the prefix could not be found.
     *
     * @param prefix the prefix to return the URL of
     * @return the URL for the given prefix, or `undefined` if the prefix could not be found
     */
    getUrl(prefix: string): string | undefined {
        return this.map.get(prefix);
    }

    /**
     * Returns `true` if and only if this map has a URL for the given prefix.
     *
     * @param prefix the prefix to check for
     * @return `true` if and only if this map has a URL for the given prefix
     */
    hasUrl(prefix: string): boolean {
        return this.map.has(prefix);
    }

    /**
     * Returns a deep copy of this `InterwikiMap`.
     *
     * This is a deep copy because the constructor builds a fresh `Map` from the entries it receives.
     *
     * @return the deep copy
     */
    copy(): InterwikiMap {
        return InterwikiMap.fromMap(this.map);
    }
}
/**
 * Describes a page, i.e. what you get if you follow an `InterlangLink`.
 */
export class Page {
    /**
     * The full URL at which this page is located.
     */
    readonly url: URL;
    /**
     * The interlanguage link describing the location of the page.
     */
    readonly link: InterlangLink;
    /**
     * The interlanguage links contained in this page.
     */
    readonly langLinks: InterlangLink[];
    /**
     * `true` if and only if this page exists.
     */
    readonly exists: boolean;

    /**
     * Constructs a new page.
     *
     * The URL, the link, and every language link are copied, so the page does not share these objects with the caller.
     *
     * @param url the full URL at which this page is located
     * @param link the interlanguage link describing the location of the page
     * @param langLinks the interlanguage links contained in this page
     * @param exists `true` if and only if this page exists
     */
    constructor(url: URL, link: InterlangLink, langLinks: InterlangLink[], exists: boolean) {
        this.url = new URL(url.toString());
        this.link = link.copy();
        this.langLinks = langLinks.map(it => it.copy());
        this.exists = exists;
    }

    /**
     * Returns `true` if and only if this page's language links are sorted alphabetically.
     *
     * Links are compared by their string representation using `localeCompare`. A page with fewer than two language
     * links is always considered ordered.
     *
     * @return `true` if and only if this page's language links are sorted alphabetically
     */
    langLinksAreOrdered(): boolean {
        return this.langLinks.every((langLink, i, self) =>
            i === 0 || self[i - 1].toString().localeCompare(langLink.toString()) <= 0
        );
    }

    /**
     * Returns `true` if and only if this page has multiple links to the same language.
     *
     * @return `true` if and only if this page has multiple links to the same language
     */
    hasDoubleLinks(): boolean {
        // A set drops duplicates, so it is smaller than the list exactly when some language occurs more than once
        const langs = this.langLinks.map(it => it.lang);
        return new Set(langs).size < langs.length;
    }

    /**
     * Returns a deep copy of this `Page`.
     *
     * This is a deep copy because the constructor performs copies of the received variables.
     *
     * @return the deep copy
     */
    copy(): Page {
        return new Page(this.url, this.link, this.langLinks, this.exists);
    }
}
/**
 * A network of pages linking to each other.
 */
export class InterlangNetwork {
    /**
     * The alphabetically-sorted pages that have been discovered in the network.
     */
    readonly pages: Page[];
    /**
     * The redirects that have been discovered in the network.
     */
    readonly redirects: Redirect[];

    /**
     * Constructs a new `InterlangNetwork`.
     *
     * Pages and redirects are copied; the copied pages are then sorted by the string form of their link.
     *
     * @param pages the pages linking to each other
     * @param redirects the redirects in the network
     */
    constructor(pages: Page[], redirects: Redirect[]) {
        const ownPages = pages.map(page => page.copy());
        ownPages.sort((left, right) => left.link.toString().localeCompare(right.link.toString()));

        this.pages = ownPages;
        this.redirects = redirects.map(redirect => redirect.copy());
    }

    /**
     * Determines whether the given source links to the given destination, potentially through a redirect.
     *
     * An exact link takes precedence over a link that differs only in case, which in turn takes precedence over a
     * link that reaches the destination through a known redirect. When both pages are in the same language, any of
     * these connections is reported as `"self-linked"`, and the absence of a connection as `"self-unlinked"`.
     *
     * @param source the source page of which to check the links
     * @param destination the destination that could be linked to
     * @return the checker's verdict of the link
     */
    getLinkVerdict(source: Page, destination: Page): LinkVerdict {
        const target = destination.link;
        const outgoing = source.langLinks;
        const sameLang = source.link.lang === target.lang;

        let connection: LinkVerdict | undefined = undefined;
        if (outgoing.some(link => link.equals(target))) {
            connection = "linked";
        } else if (outgoing.some(link => link.equalsIgnoringCase(target))) {
            connection = "wrongly-cased";
        } else if (outgoing.some(link => {
            const wanted = new Redirect(link, target);
            return this.redirects.some(known => known.equals(wanted));
        })) {
            connection = "redirected";
        }

        if (connection === undefined)
            return sameLang ? "self-unlinked" : "unlinked";
        return sameLang ? "self-linked" : connection;
    }

    /**
     * Analyzes the given source page and returns a verdict of its own state and of the state of its link to all other
     * pages in this network.
     *
     * The page's own verdicts are reported in a fixed order; `"perfect"` is reported only if nothing else applies.
     *
     * @param srcPage the page to give a verdict of
     * @return the checker's verdicts of the page and its outgoing links
     */
    getPageVerdict(srcPage: Page): { self: PageVerdict[], pages: { page: Page, verdict: LinkVerdict }[] } {
        const pageStates = this.pages.map(dstPage => ({page: dstPage, verdict: this.getLinkVerdict(srcPage, dstPage)}));
        const seen = new Set<LinkVerdict>(pageStates.map(({verdict}) => verdict));

        // Each entry pairs a condition with the verdict it triggers; the order of entries is the order of the output
        const checks: [boolean, PageVerdict][] = [
            [!srcPage.exists, "not-found"],
            [!srcPage.langLinksAreOrdered(), "wrongly-ordered"],
            [srcPage.hasDoubleLinks(), "doubly-linked"],
            [seen.has("self-linked"), "self-linked"],
            [seen.has("unlinked"), "unlinked"],
            [seen.has("redirected"), "redirected"],
            [seen.has("wrongly-cased"), "wrongly-cased"],
        ];

        const selfStates: PageVerdict[] = checks
            .filter(([applies]) => applies)
            .map(([, state]) => state);
        if (selfStates.length === 0)
            selfStates.push("perfect");

        return {self: selfStates, pages: pageStates};
    }

    /**
     * Returns a verdict on the network.
     *
     * Every page contributes `"broken"` if it is not found or misses a link, `"flawed"` if it has any other problem,
     * and `"perfect"` otherwise. The contributions are combined one by one using `mergeStates`.
     *
     * @return a verdict on the network
     */
    getNetworkVerdict(): NetworkVerdict {
        const states: NetworkVerdict[] = ["broken", "flawed", "perfect"];
        const breaking: string[] = ["not-found", "unlinked"];
        const flawing: string[] = ["wrongly-ordered", "doubly-linked", "self-linked", "redirected", "wrongly-cased"];

        let state: NetworkVerdict = "perfect";
        for (const page of this.pages) {
            const verdict = this.getPageVerdict(page).self;

            let pageState: NetworkVerdict = "perfect";
            if (verdict.some(it => breaking.includes(it)))
                pageState = "broken";
            else if (verdict.some(it => flawing.includes(it)))
                pageState = "flawed";

            state = mergeStates<NetworkVerdict>(states, state, pageState);
        }
        return state;
    }

    /**
     * Returns a deep copy of this `InterlangNetwork`.
     *
     * This is a deep copy because the constructor performs copies of the received variables.
     *
     * @return the deep copy
     */
    copy(): InterlangNetwork {
        const {pages, redirects} = this;
        return new InterlangNetwork(pages, redirects);
    }
}
/**
 * Interacts with the API in an asynchronous manner.
 */
export class MediaWiki {
    /**
     * The origin of the wiki's API URL.
     */
    readonly origin: string;
    /**
     * The path relative to the wiki's API; starts with a `/`.
     */
    readonly apiPath: string;
    /**
     * The general information, retrieved from the API.
     */
    general!: { articlepath: string, lang: string };
    /**
     * The interwiki map of this wiki.
     */
    interwikiMap!: InterwikiMap;
    /**
     * The namespaces on this wiki, keyed by namespace number.
     */
    namespaces!: Map<number, { id: string, canonical: string, "*": string }>;

    /**
     * Constructs a new MediaWiki object.
     *
     * The `#init` method **must** be called before invoking any other function. Behavior is undefined otherwise.
     *
     * @param apiUrl the url to the `api.php` file
     */
    constructor(apiUrl: string) {
        const urlObj = new URL(apiUrl);
        this.origin = urlObj.origin;
        this.apiPath = urlObj.pathname;
    }

    /**
     * Initializes this `MediaWiki` object with the necessary information from the API.
     *
     * Besides the entries reported by the API, the interwiki map also receives an entry for this wiki itself, filed
     * under this wiki's own language.
     *
     * @return this `MediaWiki` object
     */
    async init(): Promise<MediaWiki> {
        const query = await this.getSiteInfo("general", "interwikimap", "namespaces");

        // The server may be reported protocol-relative (`//example.org`), which cannot be parsed as an absolute URL
        // later on, so give it an explicit scheme
        const rawServer = query.general.server;
        const server = typeof rawServer === "string" && rawServer.startsWith("//") ? "https:" + rawServer : rawServer;

        // Add self to map
        const selfEntry = {prefix: query.general.lang, url: server + query.general.articlepath};

        // Set fields
        this.general = query.general;
        this.interwikiMap = new InterwikiMap([...query.interwikimap, selfEntry]);
        // The response is parsed JSON, so the namespaces arrive as a plain object keyed by namespace number; convert
        // it into the `Map` that the rest of this class iterates over
        this.namespaces = new Map(
            Object.entries(query.namespaces || {}).map(
                ([id, namespace]): [number, { id: string, canonical: string, "*": string }] =>
                    [Number(id), namespace as { id: string, canonical: string, "*": string }]
            )
        );
        return this;
    }

    /**
     * Sends a request to the MediaWiki API and returns the parsed JSON response.
     *
     * Any failure, whether in the connection, in the HTTP status, or while parsing the response, is reported as an
     * `Error` with the message `couldNotConnectMessage`.
     *
     * @param params the parameters to send to the API
     * @return the API's response
     */
    request(params: { [key: string]: string }): Promise<any> {
        const url = this.origin + this.apiPath + "?format=json&origin=*&" + new URLSearchParams(params).toString();
        console.debug(`Requesting from ${this.origin}${this.apiPath} with params`, params, "at", url);

        return fetch(url)
            .then(response => {
                if (!response.ok) throw new Error(couldNotConnectMessage);

                return response.json();
            })
            .catch(() => {
                throw new Error(couldNotConnectMessage);
            });
    }

    /**
     * Requests all language links on the given article.
     *
     * Redirects reported by the API are returned as well. If the target of a redirect is itself redirected, the
     * redirect is rewritten to point at that second target.
     *
     * @param title the title of the article to return links of
     * @return result the query result, or `undefined` if the article could not be found
     */
    getLangLinks(title: string): Promise<{ link: InterlangLink, langLinks: InterlangLink[], redirects: Redirect[] } | undefined> {
        return this
            .request({action: "parse", page: title, prop: "langlinks", redirects: ""})
            .then(response => {
                if (response.error !== undefined)
                    return undefined;

                const langLinks: InterlangLink[] = (response.parse.langlinks || [])
                    .map((it: { lang: string, "*": string }) => new InterlangLink(it.lang, it["*"]));

                const rawRedirects: Redirect[] = (response.parse.redirects || [])
                    .map((it: { from: string; to: string; }) => new Redirect(this.toLink(it.from), this.toLink(it.to)));
                // TODO Support triple redirects (#30)
                const redirects = rawRedirects.map(redirect => {
                    // A single follow-up redirect already makes this a double redirect, so collapse it
                    const followUp = rawRedirects.find(it => it.from.equals(redirect.to));
                    return followUp !== undefined ? new Redirect(redirect.from, followUp.to) : redirect;
                });

                return {link: this.toLink(response.parse.title), langLinks: langLinks, redirects: redirects};
            });
    }

    /**
     * Returns this wiki's site information.
     *
     * @param props the site information properties to retrieve, such as "general" or "interwikimap"
     * @return the wiki's site information, with each property corresponding to an argument to this method
     */
    getSiteInfo(...props: string[]): Promise<any> {
        return this.request({action: "query", meta: "siteinfo", siprop: props.join("|")})
            .then(response => response.query);
    }

    /**
     * Normalizes the given link, adjusting its language to this wiki's language and replacing the link's canonical
     * namespace with the name this wiki uses for that namespace.
     *
     * Only the part of the title before the first `:` is treated as a namespace. Titles without a `:` and titles
     * whose prefix is not a canonical namespace name keep their title unchanged.
     *
     * @param link the link to normalize
     * @return the normalized link
     */
    normalize(link: InterlangLink): InterlangLink {
        const normalLang = this.general.lang;

        const [prefix, ...rest] = link.title.split(":");
        if (rest.length === 0) return new InterlangLink(normalLang, link.title);

        const namespace = [...this.namespaces.values()].find(it => it["canonical"] === prefix);
        const normalPrefix = namespace !== undefined ? namespace["*"] : prefix;

        return new InterlangLink(normalLang, [normalPrefix, ...rest].join(":"));
    }

    /**
     * Shorthand for converting a title to an `InterlangLink` of this wiki's language.
     *
     * @param title the title of the article to generate a link for
     * @return the link to the article on this wiki
     * @private
     */
    private toLink(title: string): InterlangLink {
        return new InterlangLink(this.general.lang, title);
    }
}
/**
 * Manages a `MediaWiki` instance for different languages, caching retrieved information for re-use.
 */
export class MediaWikiManager {
    /**
     * The combined interwiki map of all `MediaWiki` instances under management of this manager.
     *
     * @private
     */
    private iwMap: InterwikiMap;
    /**
     * The cached `MediaWiki` instances, keyed by language. One instance may be stored under several languages.
     */
    mws: Map<string, MediaWiki>;
    /**
     * The language of the base `MediaWiki`, where the exploration starts.
     */
    baseLang!: string;
    /**
     * The path to articles, where `$1` indicates the article name.
     */
    articlePath!: string;
    /**
     * The path to `api.php`.
     */
    apiPath!: string;

    /**
     * Constructs a new MediaWiki manager.
     *
     * The `#init` method **must** be called before invoking any other function. Behavior is undefined otherwise.
     */
    constructor() {
        this.mws = new Map();
        this.iwMap = new InterwikiMap([]);
    }

    /**
     * Returns the number of leading characters that the two given strings have in common.
     *
     * @param a the first string
     * @param b the second string
     * @return the length of the longest prefix shared by both strings
     * @private
     */
    private static sharedPrefixLength(a: string, b: string): number {
        let length = 0;
        while (length < a.length && length < b.length && a[length] === b[length])
            length++;
        return length;
    }

    /**
     * Initializes this `MediaWikiManager`.
     *
     * The article path and API path of the base wiki are stored relative to the part that both paths have in common,
     * so that the API of another wiki can be derived from that wiki's article URL.
     *
     * @param baseMw the `MediaWiki` that is used as a starting point
     * @return this `MediaWikiManager`
     */
    async init(baseMw: MediaWiki): Promise<MediaWikiManager> {
        const articlePath = baseMw.general.articlepath;

        // Only a true common prefix may be stripped; characters that merely coincide at the same index further on in
        // both paths are not shared structure. The last shared character is kept, so that when the shared prefix ends
        // in a slash both relative paths still start with that slash.
        const sharedLength = MediaWikiManager.sharedPrefixLength(baseMw.apiPath, articlePath);
        const basePathLength = Math.max(0, sharedLength - 1);

        this.articlePath = articlePath.slice(basePathLength);
        this.apiPath = baseMw.apiPath.slice(basePathLength);
        this.baseLang = baseMw.general.lang;
        this.mws.set(baseMw.general.lang, baseMw);
        this.updateIwMap();
        return this;
    }

    /**
     * Returns the `MediaWiki` for the given language, creating and initializing it if necessary, or `undefined` if it
     * could not be created.
     *
     * @param lang the language of the `MediaWiki` to return
     * @return the `MediaWiki` for the given language, or `undefined` if it could not be created
     */
    async getMwOrWait(lang: string): Promise<MediaWiki | undefined> {
        if (this.hasMw(lang))
            return this.mws.get(lang);

        const url = this.iwMap.getUrl(lang);
        if (url === undefined) return undefined;

        let newMw: MediaWiki;
        try {
            // Swap the article path at the end of the URL for the API path. The end index is computed explicitly
            // because a negative-zero offset would otherwise cut away the entire URL.
            const apiUrl = url.slice(0, url.length - this.articlePath.length) + this.apiPath;
            newMw = await new MediaWiki(apiUrl).init();
        } catch (error) {
            // The wiki is unreachable or its URL could not be derived; callers treat `undefined` as "not available"
            return undefined;
        }

        if (this.hasMw(newMw.general.lang)) {
            // Duplicate MW with different but equivalent language code; destroy new MW instance
            this.mws.set(lang, this.mws.get(newMw.general.lang)!);
        } else {
            this.mws.set(newMw.general.lang, newMw);
            this.mws.set(lang, newMw);
        }

        this.updateIwMap();
        return this.mws.get(lang);
    }

    /**
     * Returns the `MediaWiki` for the given language or `undefined` if it has not created that object.
     *
     * @param lang the language of the `MediaWiki` to return
     * @return the `MediaWiki` for the given language or `undefined` if it has not created that object
     */
    getMw(lang: string): MediaWiki | undefined {
        return this.mws.get(lang);
    }

    /**
     * Returns `true` if and only if this manager has a `MediaWiki` for the given language.
     *
     * @param lang the language of the `MediaWiki` to check presence of
     * @return `true` if and only if this manager has a `MediaWiki` for the given language
     */
    hasMw(lang: string): boolean {
        return this.mws.has(lang);
    }

    /**
     * Returns the URL to the given article.
     *
     * @param link the link to return the URL of
     * @return the URL to the given article
     * @throws Error if no article path is known for the language of the given link
     */
    getArticlePath(link: InterlangLink): URL {
        const articlePath = this.iwMap.getUrl(link.lang);
        if (articlePath === undefined) throw new Error(`Could not find article path for '${link}'.`);

        // Use a replacer function so that `$` sequences inside the title (such as `$&` or `$$`) are inserted
        // literally instead of being interpreted as replacement patterns
        return new URL(articlePath.replace("$1", () => link.title));
    }

    /**
     * Updates the `iwMap` property with the entries in `MediaWiki` instances in this manager.
     *
     * If several instances define the same prefix, the instance that is visited last determines the URL.
     *
     * @private
     */
    private updateIwMap(): void {
        const combined = new Map<string, string>();
        for (const mw of this.mws.values()) {
            for (const [prefix, url] of mw.interwikiMap.map)
                combined.set(prefix, url);
        }

        this.iwMap = InterwikiMap.fromMap(combined);
    }
}
/**
 * Discovers the interlanguage network, starting from the given link.
 *
 * Starting at the given title on the manager's base wiki, every language link that is found is followed in turn
 * until no unvisited pages remain. Pages on wikis that cannot be reached and pages that the API cannot find are
 * included in the result as non-existing pages.
 *
 * @param mwm the manager to use for caching and resolving pages
 * @param title the title of the page to start traversing at
 * @param errorCb a function handling errors and warnings
 * @param progressCb a function handling progress updates; receives an HTML-formatted message
 * @return the discovered network, including pages and redirects
 * @throws Error with message `couldNotConnectMessage` if the wiki of the very first page cannot be reached
 */
export const discoverNetwork = async function(
    mwm: MediaWikiManager,
    title: string,
    errorCb: (level: "error" | "warning" | null, message: string) => void,
    progressCb: (message: string) => void
): Promise<{ pages: Page[], redirects: Redirect[] }> {
    // Titles originate from user input and from remote wikis, so they must not be able to inject markup into the
    // HTML-formatted progress message
    const htmlEscapes: { [char: string]: string } =
        {"&": "&amp;", "<": "&lt;", ">": "&gt;", "\"": "&quot;", "'": "&#39;"};
    const escapeHtml = (text: string): string => text.replace(/[&<>"']/g, char => htmlEscapes[char]);

    const pages: Page[] = [];
    const redirects: Redirect[] = [];
    const history: InterlangLink[] = [];
    const queue: InterlangLink[] = [new InterlangLink(mwm.baseLang, title)];
    const isVisited = (link: InterlangLink): boolean => history.some(it => it.equals(link));

    while (queue.length > 0) {
        let next = queue.pop()!;
        progressCb("Checking <code>" + escapeHtml(next.toString()) + "</code>");
        if (isVisited(next))
            continue;

        // Normalize
        const nextMw = await mwm.getMwOrWait(next.lang);
        if (nextMw === undefined) {
            history.push(next);
            pages.push(new Page(mwm.getArticlePath(next), next, [], false));

            // Failing on the very first page means there is nothing to explore at all
            if (history.length === 1)
                throw new Error(couldNotConnectMessage);

            errorCb("warning", `Could not connect to the wiki for language '${next.lang}'. Maybe the wiki no longer exists?`);
            continue;
        }

        next = nextMw.normalize(next);
        if (isVisited(next))
            continue;
        history.push(next);

        // Fetch interlang links
        const result = await nextMw.getLangLinks(next.title);
        if (result === undefined) {
            pages.push(new Page(mwm.getArticlePath(next), next, [], false));
            continue;
        }

        // Follow redirects
        if (!result.link.equals(next)) {
            redirects.push(...(result.redirects));

            next = result.link;
            if (isVisited(next))
                continue;
            history.push(next);
        }

        // Create `Page` object
        pages.push(new Page(mwm.getArticlePath(next), next, result.langLinks, true));
        queue.push(...(result.langLinks));
    }

    // Normalize links in place, now that the wikis of all reachable languages are known to the manager
    for (const page of pages) {
        page.langLinks.forEach((langLink, idx, self) => {
            const mw = mwm.getMw(langLink.lang);
            if (mw !== undefined)
                self[idx] = mw.normalize(langLink);
        });
    }

    return {pages: pages, redirects: redirects};
};
/**
 * The verdict that the checker has of a link between two pages.
 *
 * The possible values are listed in decreasing order of importance, so that if a single link has multiple verdicts but
 * only one can be displayed, the one with the highest importance will be displayed.
 *
 * As assigned by `InterlangNetwork#getLinkVerdict` for a source page and a destination page:
 * - `"linked"`: the pages are in different languages and the source has a link that exactly equals the destination.
 * - `"self-linked"`: the pages are in the same language and the source links to the destination, be it exactly,
 *   with different casing, or through a redirect.
 * - `"unlinked"`: the pages are in different languages and the source has no link that leads to the destination.
 * - `"self-unlinked"`: the pages are in the same language and the source has no link that leads to the destination.
 * - `"redirected"`: the pages are in different languages and a link of the source leads to the destination through
 *   a known redirect.
 * - `"wrongly-cased"`: the pages are in different languages and a link of the source matches the destination only
 *   when the case of the title is ignored.
 */
type LinkVerdict = "linked"
| "self-linked"
| "unlinked"
| "self-unlinked"
| "redirected"
| "wrongly-cased";
/**
 * The verdict that the checker has of a page.
 *
 * The possible values are listed in decreasing order of importance, so that if a single page has multiple verdicts but
 * only one can be displayed, the one with the highest importance will be displayed.
 *
 * As assigned by `InterlangNetwork#getPageVerdict`:
 * - `"perfect"`: none of the other verdicts apply to the page.
 * - `"not-found"`: the page does not exist.
 * - `"wrongly-ordered"`: the page's language links are not sorted.
 * - `"doubly-linked"`: the page has multiple links to the same language.
 * - `"self-linked"`, `"unlinked"`, `"redirected"`, `"wrongly-cased"`: at least one link from this page to a page in
 *   the network received the `LinkVerdict` of the same name.
 */
type PageVerdict =
"perfect"
| "not-found"
| "wrongly-ordered"
| "doubly-linked"
| "self-linked"
| "unlinked"
| "redirected"
| "wrongly-cased";
/**
 * The verdict that the checker has of a network.
 *
 * In `InterlangNetwork#getNetworkVerdict`, a page counts as `"broken"` if it is not found or lacks a link to another
 * page, as `"flawed"` if it has any other problem, and as `"perfect"` otherwise. The per-page results are combined
 * with `mergeStates` from `./Shared`, which presumably keeps the most severe one (TODO confirm against `./Shared`).
 */
type NetworkVerdict =
| "perfect"
| "flawed"
| "broken";