// Wikificator/html2wiki.js
/*
Wikificator - an HTML to Mediawiki converter and Typography processor in JavaScript Version 0.3.1 Last update: --17:42, 25 August 2006 (UTC) by Shtriter Andrew, http://meta.wikimedia.org/wiki/User:Shtriter
The Wikificator's main part.
It includes:
- Selection handling
- Html2Wiki transformation function
- English messages ( can be replaced in local projects )
- ProcessTypography function prototype ( should be replaced in local projects )
To include it in your project, add the following to your project's JS file:
mw.loader.load( '//meta.wikimedia.org/w/index.php?title=Wikificator/Wikificator.js&action=raw&ctype=text/javascript' );
Changelog: edit
2do list:
- Add HTML 2 inline wiki transformations
- Changelog filling
- UnEscapeInPre - replace with doHTML Entities.
- Clean the code like FCK does
- Extend the Html2Wiki - function including:
- < html >
- < nowiki >
- < math >
- < pre >
- < gallery >
- < comment >
- [[Image:]] [[Media:]]
- {{ }} and {{{ }}}
- < noinclude > & < includeonly >
- MagicWords
- < User Tags >
The file's content itself:
*/
// ---- User-facing messages (English defaults) ----
// These are assigned without `var`, so they become properties of the global
// object. wmFullText is shown by Wikify() in a confirm() dialog; wmCantWork and
// wmWontWork are shown by check_regexp() in alert() dialogs.
wmFullText = 'Wikificator will process entire article\'s text. Do you want to proceed?';
wmCantWork = 'Wikificator cannot work in your browser';
wmWontWork = 'Wikificator will not work in Netscape 4.x and less';
// ---- Local namespace names ----
// wmCategoryNS is inserted into a regular expression by Html2Wiki() and
// wmImageNS is used by MakeImage() to build [[Image:...]] links. All five are
// collected into wmLocaleNS below.
wmCategoryNS = 'Category';
wmTemplateNS = 'Template';
wmUserNS = 'User';
wmImageNS = 'Image';
wmMediaNS = 'Media';
// Prefix of internal link URLs. Html2Wiki() concatenates it, unescaped, into
// regular expression source, so regex metacharacters in it are NOT literal.
var wmIntLinkPat = '/wiki/';
// Alternative values listed by the original author:
//'/wiki/'
//'/index.php/'
//'/w?title='
//'/index.php?title='
// Parallel arrays handed to ProcessNS() by Process(): entry i of wmEnNS is the
// English namespace name that is rewritten to entry i of wmLocaleNS.
// Note: wmLocaleNS copies the wm*NS values at this point; later changes to the
// individual wm*NS variables are not reflected in it.
var wmLocaleNS = new Array ( wmCategoryNS, wmTemplateNS, wmUserNS, wmImageNS, wmMediaNS );
var wmEnNS = new Array ( 'category', 'template', 'user', 'image', 'media');
//======================================
// Entry point: wikifies the current selection of the edit box
// (document.editform.wpTextbox1), or the whole text when nothing is selected.
//
// A single space is prepended to the text before it is handed to Process()
// and removed again afterwards; Process() relies on that leading character.
function Wikify()
{
    check_regexp(); // Check whether regular expressions are supported
    var txtarea = document.editform.wpTextbox1;
    var txt; // local: the original leaked this as an implicit global
    txtarea.focus();
    // is_gecko is a flag supplied by the hosting page; guard the lookup so a
    // page that does not define it cannot raise a ReferenceError here.
    var geckoFlag = (typeof is_gecko !== 'undefined') && is_gecko;
    if (document.selection && !geckoFlag) /* IE selection API */ {
        txt = " " + document.selection.createRange().text;
        if (txt == " ") {
            all_text(); // nothing was selected
        }
        else {
            txt = Process(txt);
            txt = txt.substr(1, txt.length - 1);
            document.selection.createRange().text = txt;
        }
    }
    // Feature-detect the standard selection API. The original additionally
    // required navigator.productSub > 20031000, but Chromium and WebKit report
    // a frozen productSub of "20030107", which pushed those browsers into the
    // "process everything" fallback even when a selection existed.
    else if (typeof txtarea.selectionStart == 'number') {
        var startPos = txtarea.selectionStart;
        var endPos = txtarea.selectionEnd;
        var scrollTop = txtarea.scrollTop;
        txt = " " + txtarea.value.substring(startPos, endPos);
        if (txt == " ") {
            all_text(); // nothing was selected
        }
        else {
            txt = Process(txt);
            txt = txt.substr(1, txt.length - 1);
            txtarea.value = txtarea.value.substring(0, startPos) + txt +
                txtarea.value.substring(endPos, txtarea.value.length);
            txtarea.focus();
            // The scroll position was saved above but never put back in the
            // original; restore it so the view does not jump.
            txtarea.scrollTop = scrollTop;
        }
    }
    // Other browsers: no selection support, ask before touching everything
    else if (confirm(wmFullText)) {
        all_text();
    }
}
//======================================
// Process the whole content of the edit box (document.editform.wpTextbox1).
// A single space is prepended before calling Process() and the first
// character of the result is dropped again afterwards.
function all_text()
{
    var box = document.editform.wpTextbox1;
    // local variable: the original assigned an undeclared `txt`, which leaked
    // into the global scope and throws under strict mode
    var txt = Process(" " + box.value);
    box.value = txt.substr(1, txt.length - 1);
}
//======================================
// Check whether the browser can run Wikificator.
// Shows wmCantWork / wmWontWork in an alert() and then throws, so the calling
// code does not continue. (The original aborted by evaluating the undefined
// identifier `exit`, i.e. by an accidental ReferenceError.)
function check_regexp()
{
    // Probe regular expression support with a trivial replacement
    var probe = "code".replace(/d/g, "r");
    if (probe != "core") {
        alert(wmCantWork);
        throw new Error(wmCantWork);
    }
    // Major version number at the start of appVersion. parseInt reads the
    // whole leading number; the original looked at the first character only.
    var b_ver = parseInt(navigator.appVersion, 10);
    if (navigator.appName == "Netscape" && b_ver < 5) {
        alert(wmWontWork);
        throw new Error(wmWontWork);
    }
}
// Main pipeline: converts HTML in `t` to wikitext and applies typography.
//
// Steps:
//  1. Html2Wiki() turns HTML markup into wikitext.
//  2. Fragments that typography must not touch (nowiki/pre/math/gallery
//     blocks, templates, link targets, quoted attributes, remaining tags and
//     lines starting with a space) are replaced by numbered placeholders
//     wrapped in a pair of control characters.
//  3. ProcessTypography() runs on the remaining text.
//  4. The placeholders are replaced by the saved fragments, last pattern first,
//     because a fragment saved by a later pattern can contain placeholders of
//     earlier ones.
//
// The caller is expected to pass the text with one extra leading character
// (Wikify() and all_text() prepend a space); it is set aside while the
// "line starts with a space" pattern runs and is put back afterwards.
function Process( t )
{
    // Replace every match of `pattern` with pair[0] + N + pair[1] (N counts
    // from 1) and append the matched text to `saved`.
    function protect( text, pattern, pair, saved )
    {
        return text.replace( new RegExp( pattern, "gim" ), function ( match ) {
            saved.push( match );
            return pair.charAt(0) + saved.length + pair.charAt(1);
        } );
    }
    // Put the saved fragments back. A function is used as the replacement so
    // that "$1", "$&" or "$$" inside a fragment is inserted literally; the
    // original passed the fragment as a replacement string, which corrupted
    // such text. A placeholder with no saved fragment is left untouched.
    function restore( text, pair, saved )
    {
        return text.replace( new RegExp( pat4chars( pair ), "gm" ), function ( placeholder, number ) {
            var fragment = saved[ number - 1 ];
            return fragment === undefined ? placeholder : fragment;
        } );
    }

    //RegExp patterns for:
    var patterns = [
        pat4tags('nowiki'), // < nowiki >
        pat4tags('pre'), // < pre >
        pat4tags('math'), // < math >
        pat4tags('gallery'), // < gallery >
        "\\{\\{(.|\\r|\\n)+?\\}\\}", // templates {{ }}
        "(\\[\\[)(.*?)(\\||\\]\\])", // links [[ ]] (or [[ | )
        '(=)(\\s?)(\\")(.*?)(\\")', // attributes in quote ( =" )
        "<([^>]*?)>", // other tags
        "^( )(.+)$" // lines that start with space
    ];
    // One pair of "safe" control characters per pattern (same index)
    var save_pair = [
        "\x01\x02",
        "\x03\x04",
        "\x05\x06",
        "\x0E\x0F",
        "\x10\x11",
        "\x12\x13",
        "\x14\x15",
        "\x16\x17",
        "\x18\x19"
    ];
    // matches[i] holds the fragments saved for patterns[i], in text order
    var matches = [];
    // Leading character set aside during the spaced-lines pattern
    var f_space = '';
    var i;

    t = t.replace(/<br style="display: none;" \/>/gi, '\n');
    // Convert html representation to wikitext
    t = Html2Wiki( t );

    // An index loop is used instead of for...in so that properties added to
    // Array.prototype by other scripts are never visited.
    for (i = 0; i < patterns.length; i++)
    {
        if ( i == 3 )
        {
            // Runs once nowiki/pre/math are hidden and before the gallery,
            // template and link patterns are applied.
            t = ProcessNS( t, wmEnNS , wmLocaleNS );
            t = CorrectRanges( t );
        }
        if ( i == patterns.length - 1 )
        {
            // Remove the first character so that it is not mistaken for a
            // line that starts with a space
            f_space = t.substr (0, 1);
            t = t.substr (1, t.length-1);
        }
        matches[i] = [];
        t = protect( t, patterns[i], save_pair[i], matches[i] );
    }
    // Restore the first character removed before the spaced-lines pattern
    t = f_space + t;
    // Do the typography work
    t = ProcessTypography( t );
    // Restore the protected fragments, last pattern first
    for ( i = patterns.length - 1; i > -1; i-- )
    {
        t = restore( t, save_pair[i], matches[i] );
    }
    // Unescape text between < pre > tags
    t = t.replace( /(<pre>)([\s\S]+?)(<\/pre>)/gim, UnEscapeInPre);
    // Remove spaces and tabs at the end of a line. The original pattern
    // /^(.*)\s*&/gm deleted the last "&" of every line instead. Only
    // whitespace that follows visible text and precedes a line break is
    // removed, so the leading character and the very end of the text (which
    // may be the end of a selection inside a line) are left alone.
    t = t.replace(/(\S)[ \t]+(?=\r?\n)/g, '$1');
    return t;
}
// Convert the HTML representation of wiki markup in `t` back to wikitext:
// images, categories, internal and external links, headers, paragraphs,
// bold/italic, <hr>/<br>, tables and lists.
//
// wmIntLinkPat and wmCategoryNS are concatenated into regular expression
// source as they are, so regex metacharacters in them are not literal.
function Html2Wiki( t )
{
    // Replacement callback for external links:
    // $1 - href, $2 - remaining attributes, $3 - link text
    function doExtLinks($0, $1, $2, $3)
    {
        if ($1 !== $3) return '[' + $1 + ' ' + $3 + ']';
        return $1;
    }

    // Opening part of an internal link up to and including title=".
    // [^>] (instead of the original ".") keeps the match inside one tag;
    // with "." a link with custom text followed by a plain link on the same
    // line was matched as one unit and the first link was swallowed.
    var intLink = '<a href="' + wmIntLinkPat + '[^>]+?" title="';
    // The title value, the rest of the opening tag and its closing ">"
    var titleEnd = '([^">]+?)"[^>]*?>';

    // Replace html representation of wikitags with wikitext
    t = ProcessImages( t );
    // Process the categories
    t = t.replace( new RegExp('<span dir="ltr" style="display:none">' + intLink +
        '(Category|' + wmCategoryNS + '):' + titleEnd + '\\2<\\/a><\\/span>', 'gim'), '[[$1:$2]]');
    // Drop the category box. The original wrote \s inside a string literal,
    // which reaches RegExp as a plain "s", so a box spanning several lines
    // was never matched.
    t = t.replace( new RegExp('<div id="catlinks"><p class="catlinks">[\\s\\S]+?</p></div>', 'gim'), '');
    // Make internal links ( [[...]] ) from html
    t = t.replace( new RegExp(intLink + titleEnd + '\\1<\\/a>', 'gim'), '[[$1]]');
    t = t.replace( new RegExp(intLink + titleEnd + '\\1([a-zа-яё]*)<\\/a>', 'gim'), '[[$1]]$2');
    t = t.replace( new RegExp(intLink + titleEnd + '(.+?)<\\/a>', 'gim'), '[[$1|$2]]');
    // Make external links ( [...] ) from html.
    // Numbered links such as "[1]": the original pattern lost its backslashes
    // (\[\d+\] in a string literal becomes the character class [d+]) and
    // therefore never matched a numbered link.
    t = t.replace( new RegExp('<a href="([^">]+?)"[^>]*?>\\[\\d+\\]<\\/a>', 'gim'), '[$1]' );
    t = t.replace( new RegExp('<a href="([^">]+?)"([^>]*?)>(.+?)<\\/a>', 'gim'), doExtLinks );
    // Replace html headers <H?> with equal signs ={?}
    t = t.replace( /^<h([1-6])>(.+)<\/h\1>/gim, pat4heads );
    // Process linebreaks
    t = t.replace(/<p>(\s)*<br.*>/gim, '\n');
    // Replace paragraphs
    t = t.replace(/^<\/p>\n/gim, '');
    t = t.replace(/<\/?p\s*>/gim, '');
    // Replace <b>, <strong> tags with ''' and <i>, <em> with ''
    t = t.replace(/\<\/?(b|strong)\>/gim, "\'\'\'");
    t = t.replace(/\<\/?(i|em)\>/gim, "\'\'");
    // Replace <hr> tag with ----, normalise <hr> and <br> tags
    t = t.replace(/\<hr ?\/?\>/gi, "----");
    t = t.replace(/\<hr ([^\>\/]+?) ?\/?\>/gi, "<hr $1 />");
    t = t.replace(/\<br ?\/?\>/gi, "<br/>");
    t = t.replace(/\<br ([^\>\/]+?) ?\/?\>/gi, "<br $1 />");
    t = ProcessTables( t );
    t = ProcessLists( t );
    return t;
}
//**********************************************************
// Convert HTML table markup in `t` to wiki table syntax ({| ... |}).
//
// Differences from the original patterns:
//  - attribute groups use [^\/>] instead of [^\/], so a match cannot run past
//    the end of its own tag (the tbody pattern used to consume the tag that
//    followed a bare <tbody>, e.g. the first <tr>);
//  - a tag with attributes must have whitespace after its name, so <thead> is
//    no longer treated as <th> with the attribute "ead";
//  - <thead> and <tfoot> wrappers are dropped in the same way as <tbody>.
// As before, a tag whose attributes contain "/" is left unconverted.
function ProcessTables( t )
{
    // Table start, caption, section wrappers, table end
    t = t.replace(/ *<table ?>/gim, "{|");
    t = t.replace(/ *<table\s+([^\/>]+?) ?>/gim, "{| $1");
    t = t.replace(/ *<caption ?>(.*)<\/caption ?>/gim, "|+ $1");
    t = t.replace(/ *<\/?t(?:body|head|foot)(?:\s+[^\/>]*?)? ?>\n?/gim, "");
    t = t.replace(/ *<\/table ?>/gim, "|}");
    // Rows
    t = t.replace(/ *<tr ?>/gim, "|-");
    t = t.replace(/ *<tr\s+([^\/>]+?) ?>/gim, "|- $1");
    t = t.replace(/ *<\/tr ?>\n/gim, "");
    // Header cells
    t = t.replace(/ *<th ?>/gim, "!");
    t = t.replace(/ *<th\s+([^\/>]+?) ?>/gim, "! $1 |");
    t = t.replace(/ *<\/th ?>/gim, "");
    // Data cells
    t = t.replace(/ *<td ?>/gim, "|");
    t = t.replace(/ *<td\s+([^\/>]+?) ?>/gim, "| $1 |");
    t = t.replace(/ *<\/td ?>/gim, "");
    return t;
}
// Convert HTML lists (<ul>, <ol>, <dl>) in `t` to wiki list markup.
//
// The text is handled line by line. `prefix` is the stack of list markers of
// the currently open lists; it grows when a list opens and shrinks when one
// closes. `pref2` is the separate run of colons used for <dd> items.
// A tag pair is only recognised when both tags are on the same line.
function ProcessLists( t )
{
    var lines = t.split('\n');
    // List markers; index i corresponds to opening[i] and closing[i]
    var markers = ['*', '#', ';', ':'];
    var opening = [/<ul>\s*<li>/gi, /<ol>\s*<li>/gi, /<dl>\s*<dt>/gi, /<dl>\s*<dd>/gi];
    var items = [/<\/li>\s*<li>/gi, /<\/dt>\s*<dt>/gi, /<\/dd>\s*<dt>/gi];
    var closing = [/<\/li>\s*<\/ul>/gi, /<\/li>\s*<\/ol>/gi, /<\/dt>\s*<\/dl>/gi, /<\/dd>\s*<\/dl>/gi];
    var colons = [/<\/dt>\s*<dd>/gi, /<\/dd>\s*<dd>/gi];
    var prefix = '';
    var pref2 = '';
    // Declared locally: the original assigned an undeclared `line` and walked
    // the array with for...in.
    var line;
    var l;
    var i;
    t = '';
    for (l = 0; l < lines.length; l++)
    {
        line = lines[l];
        for (i = 0; i < markers.length; i++)
        {
            // open lists
            if (opening[i].test(line))
            {
                // add the symbol of the list when starting a new one
                prefix += markers[i];
                line = line.replace(opening[i], prefix);
            }
            // continue lists (items has three entries, colons has two)
            if ( i < 3 )
            {
                // next list item found
                if (items[i].test(line))
                {
                    // shorten the colon prefix if </dd><dt> is found
                    if ( i == 2 ) pref2 = pref2.substr (0, pref2.length-1);
                    line = line.replace(items[i], prefix);
                }
                // a <dd> follows a </dt> or a </dd>
                if ( i != 0 && colons[i-1].test(line))
                {
                    pref2 += ':';
                    line = line.replace(colons[i-1], pref2);
                }
            }
            // close lists
            if (closing[i].test(line))
            {
                prefix = prefix.substr (0, prefix.length-1);
                // \x1A marks the closing line so that it can be deleted below
                line = line.replace(closing[i], '\x1A');
                // shorten the colon prefix if </dd></dl> is found
                if ( i == 3 ) pref2 = pref2.substr (0, pref2.length-1);
            }
        }
        // add current line to the result
        t += line;
        // if the line isn't the last one, add a line break
        if (l + 1 < lines.length) t += '\n';
    }
    // delete the marked list-closing lines
    t = t.replace(/^\x1A\n?/gm, '');
    // Remove spaces and tabs at the end of a line. The original pattern
    // /^(.*)\s*&/ deleted the last "&" of the first line instead. Only
    // whitespace that follows visible text and precedes a line break is
    // removed; a whitespace-only line and the end of the text are left alone.
    t = t.replace(/(\S)[ \t]+(?=\r?\n)/g, '$1');
    return t;
}
// Replace the HTML that represents an embedded image with [[Image:...]]
// wikitext. Every match of the pattern below is handed to MakeImage().
//
// Example from the original author:
//   [[Image:Test.PNG|250px|frame|center|Caption]]
// is represented as
//   <div class="center"><div class="thumb tnone"><div style="width: 810px;"><a href="/index.php/Image:Test.PNG" class="internal" title="Caption"><img src="/images/9/9f/Test.PNG" alt="Caption" longdesc="/index.php/Image:Test.PNG" height="581" width="808"></a><div class="thumbcaption">Caption</div></div></div></div>
//
// The debugging alert() that the original showed on every call is removed.
function ProcessImages( t )
{
    var img = new RegExp(
        // optional centering wrapper; $1 - center|undef
        "(?:<div class=['\"](center)['\"]>[ \n\r]*)?" +
        // optional frame / float wrapper
        "(?:<div class=['\"]" +
        // "thumb t" + align for a frame, "float" + align for plain alignment
        // $2 - thumb t|float|undef
        // $3 - right|left|none|undef
        "(?:(thumb t|float)(right|left|none))?['\"]>[ \n\r]*" +
        // inner div (frame) or span (alignment only)
        "(?:<div.*?>|<span>)?[ \n\r]*" +
        ")?" +
        // always present: the link around the <img>
        // $4 - ImageName
        // $5 - Img caption (if was set)
        // $6 - Img width (if was set)
        "[ \n\r]*<a href=['\"].+?:(.+?)['\"].*?>[ \n\r]*<img .*?src=['\"].+?['\"]" +
        "(?: alt=['\"](.+?)['\"]|" +
        " width=['\"](.+?)['\"]|" +
        ".+?)*?" +
        ">[ \n\r]*</a>" +
        // only for a frame: caption div
        "(?:[ \n\r]*<div.*?>" +
        // only for a frame with thumb: nested div
        "(?:[ \n\r]*<div(?:.|\n|\r)*?</div>)?" +
        "(?:.|\n|\r)*?</div>)?" +
        // closing tag of the inner div (frame) or span (alignment only)
        "[ \n\r]*(?:</div>|</span>)?" +
        "(?:[ \n\r]*</div>)?" + // thumb or float
        ".*?" +
        "(?:[ \n\r]*</div>)?" // center
        ,
        'gim');
    t = t.replace( img , MakeImage);
    return t;
}
// $1 - center|undef
// $2 - thumb t|float|undef
// $3 - right|left|none|undef
// $4 - ImageName
// $5 - Img caption (if was set)
// $6 - Img width (if was set)
// Replacement callback used by ProcessImages(): builds the [[Image:...]] link.
// $0 - whole match (unused)
// $1 - center|undef
// $2 - thumb t|float|undef
// $3 - right|left|none|undef
// $4 - image name
// $5 - caption (if set)
// $6 - width (if set)
// Options are emitted in the order $1, thumbnail, $3, width, caption,
// so the caption is always last.
function MakeImage( $0, $1, $2, $3, $4, $5, $6)
{
    var options = [];
    if ($1) options.push($1);
    if ($2 == 'thumb t') options.push('thumbnail');
    if ($3) options.push($3);
    if ($6) options.push($6 + 'px');
    if ($5) options.push($5);

    var link = '[[' + wmImageNS + ':' + $4;
    for (var k = 0; k < options.length; k++)
    {
        link += '|' + options[k];
    }
    return link + ']]';
}
//***********************************************************
// Process default namespaces
// Normalise namespace prefixes of wiki links.
// For every index i, "[[" or "[[:" followed by En_NS_List[i] or
// Loc_NS_List[i] (in any letter case) and a colon is rewritten to use
// Loc_NS_List[i] exactly as given.
//
// t           - text to process
// En_NS_List  - English namespace names
// Loc_NS_List - local namespace names, parallel to En_NS_List
// Returns the processed text.
//
// The names are inserted into regular expression source as they are.
// Unlike the original, the pattern no longer ends in a greedy "(.*)", which
// consumed the rest of the line and left every further link on that line
// unprocessed; the loop index is now a local variable.
function ProcessNS( t, En_NS_List , Loc_NS_List )
{
    // Build a replacement callback; a function is used so that "$" in a
    // namespace name is inserted literally.
    function withLocalName( localName )
    {
        return function ( match, opening ) {
            return opening + localName + ":";
        };
    }

    for (var i = 0; i < En_NS_List.length; i++)
    {
        var pat = "(\\[\\[:?)(?:" + En_NS_List[i] + "|" + Loc_NS_List[i] + "):";
        t = t.replace( new RegExp( pat , "gi" ), withLocalName( Loc_NS_List[i] ) );
    }
    return t;
}
// Replacement callback for a <pre>...</pre> match: unescapes the HTML
// entities &lt; &gt; &amp; in the text between the tags.
// $0 - whole match (unused)
// $1 - opening tag
// $2 - text between the tags
// $3 - closing tag
//
// The patterns in the original had collapsed to "<" -> "<", ">" -> ">" and
// "&" -> "&", which made the function a no-op.
function UnEscapeInPre( $0, $1, $2, $3 )
{
    // &amp; is handled last so that "&amp;lt;" becomes "&lt;" and not "<"
    function html_chars(s)
    {
        return s.replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&amp;/g, "&");
    }
    return $1 + html_chars($2) + $3;
}
//***********************************************************
// RegExp pattern for given tag
// Build the RegExp source that matches <tag_name> ... </tag_name>, including
// content that spans several lines. Only a tag without attributes is matched.
function pat4tags( tag_name )
{
    var openTag = "\\<" + tag_name + "\\>";
    var closeTag = "\\<\\/" + tag_name + "\\>";
    // lazy, so that the match ends at the first closing tag
    var content = "(.|\r|\n)+?";
    return openTag + content + closeTag;
}
//***********************************************************
// RegExp pattern for "save" pair of chars
// Build the RegExp source that matches a placeholder: the first character of
// save_chars, an optional run of digits (captured), then the second
// character. Both characters are preceded by a backslash in the pattern.
function pat4chars( save_chars )
{
    var opener = "\\" + save_chars[0];
    var closer = "\\" + save_chars[1];
    return opener + "([0-9]*)" + closer;
}
//***********************************************************
// Pattern for the string that replaces html headers with equal signs
// Replacement callback for an HTML header match: wraps the header text in
// runs of "=" produced by strcopy( '=', level ).
// $0 - whole match (unused)
// $1 - header level
// $2 - header text
function pat4heads( $0, $1, $2 )
{
    var equals = strcopy( '=', $1 );
    return equals.concat( $2, equals );
}
//***********************************************************
// Copy given str n times
// Return `str` repeated `n` times. For n smaller than 2 (including 0) the
// string is returned once. `n` may be a numeric string.
//
// The original assigned the undeclared variables `pat` and `i`, which leaked
// into the global scope and throw a ReferenceError under strict mode.
function strcopy( str, n )
{
    var result = str;
    for (var i = 1; i < n; i++)
    {
        result += str;
    }
    return result;
}
//***********************************************************
// Corrects year and century ranges in text
// Correct year and century ranges written with wiki links.
// The rules are applied in order; each entry is [ pattern, replacement ]:
//  - a dash between two linked years is replaced by a single dash character
//    without surrounding spaces;
//  - a linked year followed by "г." / "гг." gets a no-break space in between;
//  - the same two rules for linked Roman-numeral centuries and "в." / "вв.".
function CorrectRanges( t ) {
    var rules = [
        // year ranges
        [ /(\(|\s)(\[\[[12]?\d{3}\]\])[\u00A0 ]?(-|--|–|—) ?(\[\[[12]?\d{3}\]\])(\W)/g, "$1$2—$4$5" ],
        [ /(\[\[[12]?\d{3}\]\]) ?(г\.|гг\.)/g, "$1\u00A0$2" ],
        // century ranges
        [ /(\(|\s)(\[\[[IVX]{1,5}\]\])[\u00A0 ]?(-|--|–|—) ?(\[\[[IVX]{1,5}\]\])(\W)/g, "$1$2—$4$5" ],
        [ /(\[\[[IVX]{1,5}\]\]) ?(в\.|вв\.)/g, "$1\u00A0$2" ]
    ];
    for (var k = 0; k < rules.length; k++)
    {
        t = t.replace( rules[k][0], rules[k][1] );
    }
    return t;
}
//***********************************************************
// Prototype of ProcessTypography( t ) function
// Placeholder for the typography step: returns the text it is given.
// The file header states that local projects should replace this function.
function ProcessTypography( t )
{
    var unchanged = t;
    return unchanged;
}
//