User:Zeno Gantner/Perl hacks
I used these scripts to enhance an existing translation dictionary with Wikipedia interlink word pairs.
All you need is a working Perl installation and the current Wikipedia database dump in a MySQL database.
The programs are just quick hacks, so no documentation and no warranty whatsoever. Additionally licensed under GNU GPL, Version 2 or later, as published by the Free Software Foundation. --Zenogantner 17:23, 1 Dec 2004 (UTC)
I think these scripts won't work for newer MediaWiki versions like 1.5, as the database schema was changed. --zeno 12:21, 30 Jun 2005 (UTC)
extract_articles.pl
edit#!/usr/bin/perl -w # Extracts all real articles from the Wikipedia database # Only saves the article ID, the title and the article's text # into the new table. use DBI; my $user = 'db_user'; my $password = 'db_user_pw'; my $step = 1000; my $handle = DBI->connect("dbi:mysql:wikipedia_en", $user, $password); my $query = $handle->prepare(<<SQL); SELECT MAX(cur_id) FROM cur SQL $query->execute; my ($max) = $query->fetchrow_array(); print "Highest article id is $max\n"; my $drop = $handle->prepare(<<SQL); DROP TABLE articles SQL $drop->execute; my $create = $handle->prepare(<<SQL); CREATE TABLE articles ( counter INT(8) UNSIGNED, cur_id INT(8) UNSIGNED, cur_title VARCHAR(255) BINARY, cur_text MEDIUMTEXT, PRIMARY KEY (counter)) SQL $create->execute; $query = $handle->prepare(<<SQL); SELECT cur_id, cur_title, cur_text FROM cur WHERE cur_namespace = 0 AND cur_is_redirect = 0 AND cur_id >= ? AND cur_id < ? SQL $ins = $handle->prepare(<<SQL); INSERT INTO articles (counter, cur_id, cur_title, cur_text) VALUES (?, ?, ?, ?) SQL $max++; # increment max to have the last row included (if max mod step == 0) my $counter = 0; for (my $low = 0; $low < $max; $low += $step) { my $high = $low + $step; print "Query from id $low to $high: "; print 'Now '; $query->execute($low, $high); while (my @row = $query->fetchrow_array()) { my ($id, $title, $text) = @row; $ins->execute($counter++, $id, $title, $text); } print "$counter entries in the database\n"; }
extract_commas.pl
edit#!/usr/bin/perl -w # extract articles that have commas in their name (mostly geographical names, # book/movie/album titles, persons) # This script will most likely make sense if you use English data, as # English town names often have the form # TOWN, STATE or TOWN, COUNTRY my @target_languages = ('de', 'es', 'ca', 'fr'); use DBI; my $user = 'root'; my $password = 'root'; my $step = 1000; my $handle = DBI->connect("dbi:mysql:wikipedia_en", $user, $password); my $query = $handle->prepare(<<SQL); SELECT MAX(counter) FROM interlinks SQL $query->execute; my ($max) = $query->fetchrow_array(); print "Highest article id is $max\n"; my $drop = $handle->prepare(<<SQL); DROP TABLE comma_articles SQL $drop->execute; $drop = $handle->prepare(<<SQL); DROP TABLE noncomma_articles SQL $drop->execute; $drop = $handle->prepare(<<SQL); DROP TABLE paranthese_articles SQL $drop->execute; my $create = $handle->prepare(<<SQL); CREATE TABLE comma_articles ( counter INT(8) UNSIGNED, cur_id INT(8) UNSIGNED, cur_title VARCHAR(255) BINARY, $target_languages[0] VARCHAR(255) BINARY, $target_languages[1] VARCHAR(255) BINARY, $target_languages[2] VARCHAR(255) BINARY, $target_languages[3] VARCHAR(255) BINARY, PRIMARY KEY (counter)) SQL $create->execute or die; $create = $handle->prepare(<<SQL); CREATE TABLE noncomma_articles ( counter INT(8) UNSIGNED, cur_id INT(8) UNSIGNED, cur_title VARCHAR(255) BINARY, $target_languages[0] VARCHAR(255) BINARY, $target_languages[1] VARCHAR(255) BINARY, $target_languages[2] VARCHAR(255) BINARY, $target_languages[3] VARCHAR(255) BINARY, PRIMARY KEY (counter)); SQL $create->execute; $create = $handle->prepare(<<SQL); CREATE TABLE paranthese_articles ( counter INT(8) UNSIGNED, cur_id INT(8) UNSIGNED, cur_title VARCHAR(255) BINARY, comment VARCHAR(255) BINARY, $target_languages[0] VARCHAR(255) BINARY, $target_languages[1] VARCHAR(255) BINARY, $target_languages[2] VARCHAR(255) BINARY, $target_languages[3] VARCHAR(255) BINARY, PRIMARY KEY (counter)); SQL $create->execute; $query = $handle->prepare(<<SQL); SELECT cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2] FROM interlinks WHERE counter >= ? AND counter < ? SQL my $ins1 = $handle->prepare(<<SQL); INSERT INTO comma_articles (counter, cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3]) VALUES (?, ?, ?, ?, ?, ?, ?); SQL my $ins2 = $handle->prepare(<<SQL); INSERT INTO noncomma_articles (counter, cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3]) VALUES (?, ?, ?, ?, ?, ?, ?); SQL my $ins3 = $handle->prepare(<<SQL); INSERT INTO paranthese_articles (counter, cur_id, cur_title, comment, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3]) VALUES (?, ?, ?, ?, ?, ?, ?, ?); SQL $max++; # increment max to have the last row included (if max mod step == 0) my $comma_counter = 0; my $noncomma_counter = 0; my $paranthese_counter = 0; for (my $low = 0; $low < $max; $low += $step) { my $high = $low + $step; print "Query from id $low to $high: "; $query->execute($low, $high) or die; while (my @row = $query->fetchrow_array()) { my ($id, $title, $l1, $l2, $l3, $l4) = @row; if ($title =~ /,/) { $ins1->execute($comma_counter++, $id, $title, $l1, $l2, $l3, $l4) or die; } else { $ins2->execute($noncomma_counter++, $id, $title, $l1, $l2, $l3, $l4) or die; if ($title =~ /\((.+)\)/ ) { $ins3->execute($paranthese_counter++, $id, $title, $1, $l1, $l2, $l3, $l4) or die; } } } print "$comma_counter article names with comma, $noncomma_counter without. $paranthese_counter article titles with comments in parantheses\n"; }
extract_interlinks.pl
edit#!/usr/bin/perl -w my $source_language = 'en'; my @target_languages = ('de', 'es', 'ca', 'fr'); use DBI; my $user = 'root'; my $password = 'root'; my $step = 1000; my $handle = DBI->connect("dbi:mysql:wikipedia_en", $user, $password); my $query = $handle->prepare(<<SQL); SELECT MAX(counter) FROM articles SQL $query->execute; my ($max) = $query->fetchrow_array(); print "Highest article id is $max\n"; my $drop = $handle->prepare(<<SQL); DROP TABLE interlinks SQL $drop->execute; my $create = $handle->prepare(<<SQL); CREATE TABLE interlinks ( counter INT(8) UNSIGNED, cur_id INT(8) UNSIGNED, cur_title VARCHAR(255) BINARY, $target_languages[0] VARCHAR(255) BINARY, $target_languages[1] VARCHAR(255) BINARY, $target_languages[2] VARCHAR(255) BINARY, $target_languages[3] VARCHAR(255) BINARY, PRIMARY KEY (counter)) SQL $create->execute; $query = $handle->prepare(<<SQL); SELECT cur_id, cur_title, cur_text FROM articles WHERE counter >= ? AND counter < ? SQL $ins = $handle->prepare(<<SQL); INSERT INTO interlinks (counter, cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3]) VALUES (?, ?, ?, ?, ?, ?, ?) SQL $max++; # increment max to have the last row included (if max mod step == 0) my $counter = 0; for (my $low = 0; $low < $max; $low += $step) { my $high = $low + $step; print "Query from id $low to $high: "; $query->execute($low, $high) or die; my $no_counter = 0; while (my @row = $query->fetchrow_array()) { my ($id, $title, $text) = @row; $title =~ s/\_/ /; # change underscore to whitespace ($l1, $l2, $l3, $l4) = @target_languages; my %trans; for my $l (@target_languages) { if ( $text =~ /\[\[$l:([^\]^\[]+)\]\]/ ) { my $translation = $1; $translation =~ s/_/ /; $trans{$l} = $translation; } } if (%trans) { $ins->execute($counter++, $id, $title, $trans{$l1}, $trans{$l2}, $trans{$l3}, $trans{$l4}) or warn; } else { $no_counter++; } } print "Dropped $no_counter articles without translation, "; print "now $counter interwiki link sets in the database\n"; }