Translating Dictionary
This is a translating and disambiguating dictionary that uses raw Wikipedia data to find translations and disambiguations. It was written by Simon Kissane.
<?
/**
 * One Wikipedia article, fetched as XML through Special:Export.
 *
 * The constructor downloads the article and keeps its title and raw wiki
 * text. The remaining methods mine that wiki text for inter-language links
 * ("translations"), ordinary wiki links ("related terms") and the contents
 * of the matching "(disambiguation)" page.
 *
 * getLangs() and form() are static because the page script calls them as
 * WikipediaArticle::getLangs() / WikipediaArticle::form().
 */
class WikipediaArticle {
    /** Language code the article was requested in. */
    public $lang;
    /** Article title as reported by the export XML ("" if nothing was parsed). */
    public $title = "";
    /** Raw wiki text of the article ("" if nothing was parsed). */
    public $text = "";
    /** Name of the XML element currently being collected ("title", "text") or NULL. */
    public $in;
    /** The article name that was asked for, as passed to get_article(). */
    public $word = "";

    /**
     * Fetch and parse an article.
     *
     * @param string $art  Article name to look up.
     * @param string $lang Wikipedia language code.
     */
    public function __construct($art,$lang="en") {
        $this->lang = $lang;
        $this->get_article($art,$lang);
    }

    /**
     * Base URL of Special:Export for one language edition.
     *
     * $lang ends up inside the host name, so only a plain lower-case code is
     * accepted; anything else falls back to "en" rather than letting a
     * crafted value redirect the fetch to another host.
     *
     * @param string $lang Wikipedia language code.
     * @return string URL prefix; the encoded article name is appended to it.
     */
    public function getURL($lang) {
        if (!is_string($lang) || !preg_match('/^[a-z][a-z0-9-]{0,31}$/D', $lang)) {
            $lang = "en";
        }
        return "https://{$lang}.wikipedia.org/wiki/Special%3AExport/";
    }

    /**
     * Download the export XML for an article.
     *
     * @param string $art  Article name.
     * @param string $lang Wikipedia language code.
     * @return string|false The XML document, or FALSE if the request failed.
     */
    public function get_article_xml($art,$lang) {
        // The title is a path segment: rawurlencode() turns a space into %20,
        // whereas urlencode() would produce "+", which is a literal plus sign
        // in a path and therefore names a different page.
        $url = $this->getURL($lang) . rawurlencode($art);
        // Identify the client and bound the wait; Wikimedia's User-Agent
        // policy asks automated clients to send a descriptive User-Agent.
        $context = stream_context_create(array(
            "http" => array(
                "user_agent" => "TranslatingDictionary/1.0 (PHP WikipediaArticle)",
                "timeout" => 10,
            ),
        ));
        return file_get_contents($url, false, $context);
    }

    /**
     * XML start-tag handler: begin collecting when <title> or <text> opens.
     */
    public function xml_start($xp,$tag,$attrs) {
        if ($tag === "title" || $tag === "text") {
            $this->in = $tag;
        }
    }

    /**
     * XML end-tag handler: stop collecting when the current element closes.
     */
    public function xml_end($xp,$tag) {
        if ($this->in == $tag) {
            $this->in = NULL;
        }
    }

    /**
     * XML character-data handler. The parser may deliver one element's text
     * in several pieces, so the data is appended rather than assigned.
     */
    public function xml_cdata($xp,$data) {
        if ($this->in === "title") {
            $this->title .= $data;
        } elseif ($this->in === "text") {
            $this->text .= $data;
        }
    }

    /**
     * Fetch an article and fill in $title and $text from its export XML.
     *
     * If the download fails, $title and $text are left as empty strings.
     *
     * @param string $art  Article name.
     * @param string $lang Wikipedia language code.
     */
    public function get_article($art,$lang="en") {
        // Start from a clean slate so a second call does not append to the
        // previous article's title and text.
        $this->word = (string)$art;
        $this->title = "";
        $this->text = "";
        $this->in = NULL;

        $xml = $this->get_article_xml($art,$lang);
        if ($xml === FALSE || $xml === "") {
            return;
        }

        $xp = xml_parser_create();
        // Keep element names in their original case so they compare equal
        // to "title" and "text".
        xml_parser_set_option($xp, XML_OPTION_CASE_FOLDING, 0);
        xml_set_element_handler($xp,
            array($this,"xml_start"),
            array($this,"xml_end"));
        xml_set_character_data_handler($xp, array($this,"xml_cdata"));
        xml_parse($xp,$xml,TRUE);
        // Before PHP 8 the parser is a resource that has to be released;
        // from PHP 8 on it is an object and needs no explicit free.
        if (is_resource($xp)) {
            xml_parser_free($xp);
        }
    }

    /**
     * Table of known language codes.
     *
     * @return array Map of language code => English language name.
     */
    public static function getLangs() {
        return array("aa" => "Afar",
            "ab" => "Abkhazian",
            "af" => "Afrikaans",
            "als" => "Alsatian",
            "am" => "Amharic",
            "an" => "Aragonese",
            "ar" => "Arabic",
            "as" => "Assamese",
            "ast" => "Asturian",
            "ay" => "Aymara",
            "az" => "Azeri",
            "ba" => "Bashkir",
            "bal" => "Baluchi",
            "ban" => "Balinese",
            "be" => "Belorussian",
            "ber" => "Berber (Tamazight)",
            "bg" => "Bulgarian",
            "bh" => "Bihari",
            "bi" => "Bislama",
            "bn" => "Bengali",
            "bo" => "Tibetan",
            "br" => "Breton",
            "bs" => "Bosnian",
            "bug" => "Buginese",
            "ca" => "Catalan",
            "ceb" => "Cebuano",
            "ch" => "Chamoru",
            "che" => "Chechen",
            "chm" => "Meadow Mari",
            "chr" => "Cherokee",
            "chv" => "Chuvash",
            "co" => "Corsican",
            "crh" => "Crimean Tatar",
            "cs" => "Czech",
            "csb" => "Kashubian",
            "cy" => "Welsh",
            "da" => "Danish",
            "de" => "German",
            "diu" => "Diudish",
            "div" => "Dhivehi",
            "dz" => "Dzongkha",
            "el" => "Greek",
            "en" => "English",
            "eo" => "Esperanto",
            "es" => "Spanish",
            "eso" => "Ekspreso",
            "et" => "Estonian",
            "eu" => "Basque",
            "fa" => "Persian",
            "fi" => "Finnish",
            "fiu" => "Karelian",
            "fj" => "Fijian",
            "fo" => "Faeroese",
            "fr" => "French",
            "fy" => "Frisian",
            "ga" => "Irish",
            "gay" => "Gayo",
            "gd" => "Scottish Gaelic",
            "gl" => "Galician",
            "gn" => "Guarani",
            "gs" => "Glosa",
            "gu" => "Gujarati",
            "gv" => "Manx Gaelic",
            "ha" => "Hausa",
            "he" => "Hebrew",
            "hi" => "Hindi",
            "hr" => "Croatian",
            "hu" => "Hungarian",
            "hy" => "Armenian",
            "ia" => "Interlingua",
            "iba" => "Iban",
            "id" => "Indonesian",
            "ie" => "Interlingue (ex occidental)",
            "ik" => "Inupiak",
            "is" => "Icelandic",
            "it" => "Italian",
            "iu" => "Inuktitut",
            "ja" => "Japanese",
            "jv" => "Javanese",
            "ka" => "Georgian",
            "kaw" => "Kawi",
            "kk" => "Kazakh",
            "kl" => "Greenlandic",
            "km" => "Khmer",
            "kn" => "Kannada",
            "ko" => "Korean",
            "ks" => "Kashmiri",
            "ku" => "Kurdish",
            "ky" => "Kirghiz (also Kyrgyz)",
            "la" => "Latin",
            "li" => "Limburgian",
            "ln" => "Lingala",
            "lo" => "Laotian",
            "ls" => "Latino Sine Flexione",
            "lt" => "Lithuanian",
            "lv" => "Latvian",
            "mad" => "Madurese",
            "mak" => "Makasar",
            "mg" => "Malagasy",
            "mi" => "Maori",
            "min" => "Minangkabau",
            "mk" => "Macedonian",
            "ml" => "Malayalam",
            "mn" => "Mongolian",
            "mo" => "Moldovan",
            "mr" => "Marathi",
            "ms" => "Malay",
            "mt" => "Maltese",
            "my" => "Burmese",
            "na" => "Nauri",
            "nah" => "Nahuatl",
            "nds" => "Low Saxon",
            "ne" => "Nepali",
            "ng" => "Ndongo",
            "nl" => "Dutch",
            "no" => "Norwegian",
            "oc" => "Occitan",
            "om" => "Oromo",
            "or" => "Oriya",
            "pa" => "Punjabi",
            "pl" => "Polish",
            "ps" => "Pashto",
            "pt" => "Portuguese",
            "qu" => "Quechua",
            "ra" => "Romanica",
            "rm" => "Rhaeto-Romance",
            "rn" => "Kirundi",
            "ro" => "Romanian",
            "roa-rup" => "Aromanian",
            "ru" => "Russian",
            "rw" => "Kinyarwanda",
            "sa" => "Sanskrit",
            "sc" => "Sardinian",
            "sd" => "Sindhi",
            "sg" => "Sangro",
            "sh" => "Serbo-Croatian",
            "si" => "Singhalese",
            "sk" => "Slovak",
            "sl" => "Slovene",
            "sm" => "Samoan",
            "sn" => "Shona",
            "son" => "Songhay",
            "sq" => "Albanian",
            "sr" => "Serbian",
            "ss" => "Siswati",
            "st" => "Sesotho",
            "su" => "Sundanese",
            "sv" => "Swedish",
            "sw" => "Swahili",
            "ta" => "Tamil",
            "te" => "Telugu",
            "tg" => "Tajik",
            "th" => "Thai",
            "ti" => "Tigrinya",
            "tk" => "Turkmen",
            "tl" => "Tagalog",
            "tlh" => "Klingon",
            "tn" => "Setswana",
            "to" => "Tonga",
            "tokipona" => "Toki Pona",
            "tpi" => "Tok Pisin",
            "tr" => "Turkish",
            "ts" => "Tsonga",
            "tt" => "Tatar",
            "tw" => "Twi",
            "tzm" => "Tamazight",
            "udm" => "Udmurt",
            "ug" => "Uighur",
            "uk" => "Ukrainian",
            "ur" => "Urdu",
            "uz" => "Uzbek",
            "vi" => "Vietnamese",
            "vo" => "Volapuk",
            "wo" => "Wolof",
            "xh" => "Xhosa",
            "yi" => "Yiddish",
            "yo" => "Yoruba",
            "za" => "Zhuang",
            "zh" => "Chinese",
            "zh-cfr" => "Min-nan",
            "zu" => "Zulu");
    }

    /**
     * Turn a language code into its English name.
     *
     * @param string $lang Language code.
     * @return string The name from getLangs(), or $lang itself if unknown.
     */
    public function decodeLang($lang) {
        $langs = self::getLangs();
        if (array_key_exists($lang,$langs)) {
            return $langs[$lang];
        }
        return $lang;
    }

    /**
     * Echo the search form: a word box, a language drop-down and a submit
     * button. The form submits by GET to the current URL.
     *
     * @param string $word    Text to pre-fill the word box with.
     * @param string $askLang Language code to pre-select.
     */
    public static function form($word="",$askLang="en") {
        $langs = self::getLangs();
        // A language reached through a translation link may be missing from
        // the table. Offer it anyway, labelled by its code, so resubmitting
        // the form does not silently switch to the first listed language.
        if (is_string($askLang) && $askLang !== "" && !array_key_exists($askLang,$langs)) {
            $langs[$askLang] = $askLang;
        }
        asort($langs);

        // $word comes from the request; escape it for the attribute context.
        $wordAttr = htmlspecialchars((string)$word, ENT_QUOTES, 'UTF-8');

        echo "<form method='get' action=''>\n";
        echo "<input type='text' name='word' id='word' value='{$wordAttr}' />\n";
        echo "<select name='lang'>\n";
        foreach ($langs as $code => $name) {
            $codeAttr = htmlspecialchars((string)$code, ENT_QUOTES, 'UTF-8');
            $nameHtml = htmlspecialchars((string)$name, ENT_QUOTES, 'UTF-8');
            echo "<option value=\"{$codeAttr}\"";
            if ((string)$code === (string)$askLang) {
                echo " selected ";
            }
            echo ">{$nameHtml}</option>\n";
        }
        echo "</select>\n";
        echo "<input type='submit' value='Translate' />\n";
        echo "</form>\n";
    }

    /**
     * Collect the targets of ordinary wiki links in the article text.
     *
     * Piped links ([[target|label]]) are scanned first, then plain links
     * ([[target]]). Targets containing ":" never match, purely numeric
     * targets and targets starting with "#" are dropped.
     *
     * @return array Unique link targets, underscores shown as spaces.
     */
    public function getRelatedTerms() {
        $patterns = array(
            '/\[\[([^]:]+)\|([^]:]+)\]\]/',
            '/\[\[([^]:|]+)\]\]/',
        );
        $terms = array();
        foreach ($patterns as $pattern) {
            preg_match_all($pattern,(string)$this->text,$matches);
            foreach ($matches[1] as $term) {
                // rawurldecode() leaves "+" alone; urldecode() would turn
                // a target such as "C++" into "C  ".
                $term = rawurldecode(str_replace("_"," ",$term));
                if (!preg_match('/^[0-9]+$|^#/',$term)) {
                    $terms[] = $term;
                }
            }
        }
        return array_unique($terms);
    }

    /**
     * Collect inter-language links ([[code:Title]]) from the article text.
     *
     * A code is two or three lower-case letters, optionally followed by
     * "-suffix". The suffix is limited to code characters so that a link
     * like [[de-facto]] cannot swallow the text up to the next colon.
     *
     * @return array Map of code => array("word" => title, "lang" => name).
     */
    public function getTranslations() {
        $trans = array();
        preg_match_all('/\[\[([a-z][a-z][a-z]?(?:-[a-z0-9-]+)?):([^]]*)]]/',
            (string)$this->text,$matches,PREG_SET_ORDER);
        foreach ($matches as $m) {
            $trans[$m[1]] = array(
                "word" => rawurldecode($m[2]),
                "lang" => $this->decodeLang($m[1]),
            );
        }
        return $trans;
    }

    /**
     * List candidate meanings of the requested word.
     *
     * Fetches "<title> (disambiguation)" in the same language and takes its
     * related terms; if this article's own text contains "{{disambig}}", its
     * related terms are added too. Only terms that contain the requested
     * word (case-insensitively) are kept.
     *
     * @return array Unique matching terms (keys are not renumbered).
     */
    public function getDisambiguations() {
        $title = (string)$this->title;
        // Filter on the name this article was requested under instead of
        // reading $_REQUEST, so the class also works outside the page script.
        $needle = ((string)$this->word !== "") ? (string)$this->word : $title;
        $base = ($title !== "") ? $title : $needle;

        $disambig = array();
        if ($base !== "") {
            $u = new self($base . " (disambiguation)",$this->lang);
            $disambig = $u->getRelatedTerms();
        }
        if (strpos((string)$this->text,"{{disambig}}") !== FALSE) {
            $disambig = array_merge($this->getRelatedTerms(),$disambig);
        }
        // With nothing to filter on, keep every term; an empty needle would
        // warn before PHP 8 and match every term from PHP 8 on.
        if ($needle !== "") {
            foreach ($disambig as $i => $tm) {
                if (stripos($tm,$needle) === FALSE) {
                    unset($disambig[$i]);
                }
            }
        }
        return array_unique($disambig);
    }
}
// Announce the page as UTF-8 HTML; this has to run before any page output.
header('Content-Type: text/html; charset=UTF-8');
?>
<html>
<head>
<!-- Static page head; the PHP section that follows writes the page body. -->
<title>Translating and Disambiguating Dictionary</title>
<style>
/* Light orange page background with purple headings. */
body { background: #ffcc77; }
h1, h2 { color: #770077; }
</style>
</head>
<body>
<?
// Page controller. Request parameters:
//   source - if present, show this script's own source and stop
//   lang   - Wikipedia language code (defaults to "en")
//   word   - the word to look up; without it only the search form is shown
//
// Everything echoed below that originates from the request or from fetched
// article text is HTML-escaped, and values placed in link query strings are
// URL-encoded first.

if (array_key_exists("source",$_REQUEST)) {
    // "View source" feature: print the highlighted source of this script.
    show_source($_SERVER['SCRIPT_FILENAME']);
    exit;
}

// The language code is spliced into a wikipedia.org host name, so accept
// only a plain lower-case code and fall back to English otherwise.
$askLang = "en";
if (isset($_REQUEST["lang"]) && is_string($_REQUEST["lang"])
        && preg_match('/^[a-z][a-z0-9-]{0,31}$/D',$_REQUEST["lang"])) {
    $askLang = $_REQUEST["lang"];
}

// A non-string value (e.g. word[]=x) is treated like a missing word.
$word = "";
if (isset($_REQUEST["word"]) && is_string($_REQUEST["word"])) {
    $word = $_REQUEST["word"];
}

if ($word == "") {
    echo "<h1>Translating Dictionary</h1>\n";
    echo "<p>Written by Simon Kissane</p>\n";
    WikipediaArticle::form("",$askLang);
    echo "Based on <a href=\"http://www.wikipedia.org\">Wikipedia</a><br />\n";
}
else {
    $a = new WikipediaArticle($word,$askLang);
    $titleHtml = htmlspecialchars((string)$a->title, ENT_QUOTES, 'UTF-8');
    echo "<h1>Translating Dictionary: {$titleHtml}</h1>\n";
    echo "<p>Written by Simon Kissane</p>\n";
    WikipediaArticle::form($word,$askLang);

    // Find synonym: a redirect page points at the article it stands for.
    // MediaWiki accepts the keyword in any letter case, hence the /i flag.
    if (preg_match('/#REDIRECT\s*\[\[([^]]*)\]\]/i',(string)$a->text,$matches)) {
        $href = htmlspecialchars(
            "?word=" . urlencode($matches[1]) . "&lang=" . urlencode($askLang),
            ENT_QUOTES, 'UTF-8');
        $label = htmlspecialchars($matches[1], ENT_QUOTES, 'UTF-8');
        echo "<p><b>Synonym of:</b> <a href=\"{$href}\">{$label}</a></p>\n";
    }

    // Find translations
    $trans = $a->getTranslations();
    echo "<h2>Translations</h2>\n";
    if (count($trans) == 0)
        echo "<p>No translations available</p>\n";
    else {
        echo "<table>\n";
        echo "<tr><th>Word</th><th>Language</th></tr>\n";
        foreach ($trans as $code => $t) {
            $href = htmlspecialchars(
                "?word=" . urlencode($t['word']) . "&lang=" . urlencode((string)$code),
                ENT_QUOTES, 'UTF-8');
            $label = htmlspecialchars($t['word'], ENT_QUOTES, 'UTF-8');
            $langName = htmlspecialchars($t['lang'], ENT_QUOTES, 'UTF-8');
            echo "<tr><td><a href=\"{$href}\">{$label}</a></td><td>{$langName}</td></tr>\n";
        }
        echo "</table>\n";
    }

    // Find disambiguations
    $disambig = $a->getDisambiguations();
    echo "<h2>Disambiguations</h2>\n";
    if (count($disambig) == 0)
        echo "<p>No disambiguations known.</p>\n";
    foreach ($disambig as $tm) {
        $href = htmlspecialchars(
            "?word=" . urlencode($tm) . "&lang=" . urlencode($askLang),
            ENT_QUOTES, 'UTF-8');
        $label = htmlspecialchars($tm, ENT_QUOTES, 'UTF-8');
        echo "<a href=\"{$href}\">{$label}</a><br />\n";
    }

    // Link to the article itself. The title is a path segment here, so
    // spaces become underscores and the rest is rawurlencode()d; a "+" from
    // urlencode() would be read as a literal plus sign.
    $articleUrl = htmlspecialchars(
        "http://{$askLang}.wikipedia.org/wiki/" . rawurlencode(str_replace(" ","_",$word)),
        ENT_QUOTES, 'UTF-8');
    echo "<p>Based on the <a href='{$articleUrl}'>Wikipedia article</a>.</p>\n";
}
echo "<p><a href=\"?source\">View source</a></p>\n";
?>
</body>
</html>