# ifexistsLogSorts.py
# Sorts the file ifexists.log.1 and extracts lines like:
# 2007-12-03 06:27:16 zhwiki: 131 http://zh.wikipedia.org/wiki/%E6%B4%9B%E7%A3%AF%E5%B1%B1%E8%84%88
# Source log: http://noc.wikimedia.org/~tstarling/ifexist.log
import re
import codecs

#import urllib
#import time

## REGEX (language-independent cleanups, compiled once at import time)
# Strips "&variant=zh-tw"-style query tails from a URL.
_AND_TAIL = re.compile(r'&.*\b', flags=re.U)
# Rewrites the long "w/index.php?title=" URL form to the short "wiki/" form.
_LONG_FORM = re.compile(r'w/index\.php\?title=', flags=re.U)


def clean_url(url):
    """Return *url* with any '&...' query tail removed and the long
    'w/index.php?title=' form replaced by the short 'wiki/' form.
    """
    url = _AND_TAIL.sub('', url)
    return _LONG_FORM.sub('wiki/', url)


def main():
    """Prompt for a language and output file name, extract that language's
    URLs from ifexists.log.1, de-duplicate and shorten them, and write one
    URL per line to the chosen data file.
    """
    lang = 'sw'
    print('ifexistsLogSorts.py - choose language')
    print('lang=' + lang)
    x = input('language?(sw)')
    if x != '':
        lang = x
    dataFileName = 'ifexists.' + lang
    x = input('dataFileName? ' + dataFileName)
    if x != '':
        dataFileName = x
    print(dataFileName)

    # Every URL of the chosen language's wiki, up to the last word boundary.
    urlX = re.compile(r'http\://' + lang + r'.+\b', flags=re.U)

    # BUG FIX: the log is only read, so open it in 'r' mode — the original
    # 'a+' append mode was wrong (and under Python 3 positions the stream at
    # end-of-file, making read() return nothing). 'with' guarantees closing.
    with codecs.open('ifexists.log.1', 'r', encoding='utf-8') as logFile:
        hits = urlX.findall(logFile.read())  # renamed from 'list' (shadowed builtin)

    unique = set(hits)

    # Pause before processing; an empty reply keeps the per-URL confirmation
    # prompts, anything else automates the rest of the run.
    a = input('haha\n\n\n\n\n\a')

    ## TREATING THE URLS AND HOPEFULLY FURTHER REDUCING THE SET
    shortened = {clean_url(u) for u in unique}

    n = 0
    with codecs.open(dataFileName, 'w', encoding='utf-8') as dataFile:
        for u in shortened:
            if a == '':
                a = input(u + '\npress return to continue, something else to automate')
            n += 1
            dataFile.write(u + '\n')
    print(n, 'urls in total.')
    print('dataFile IS ', dataFileName)


if __name__ == '__main__':
    main()