# Source: User:Hillgentleman/ifexistslogsort.py

#ifexistsLogSorts.py
# TO SORT THE FILE ifexists.log.1 AND EXTRACT lines like:
#          2007-12-03 06:27:16 zhwiki: 131 http://zh.wikipedia.org/wiki/%E6%B4%9B%E7%A3%AF%E5%B1%B1%E8%84%88
#FROM  http://noc.wikimedia.org/~tstarling/ifexist.log

lang='sw'
print 'ifexistsLogSorts.py - choose language'
print 'lang='+lang
x=raw_input('language?(sw)')
if x!='': lang=x

dataFileName='ifexists.'+lang
x=raw_input('dataFileName? '+dataFileName)
if x!='': dataFileName=x

print dataFileName

a = ''

import re
import codecs
#import urllib
#import time


## REGEX
# lang comes straight from raw_input above: escape it so a typo containing
# regex metacharacters cannot break (or silently widen) the URL pattern.
urlX = re.compile(r'http://' + re.escape(lang) + r'.+\b', flags=re.U)  # log lines' URLs for this wiki
andX = re.compile(r'&.*\b', flags=re.U)  # REMOVE &variant=zh-tw blablabla TAIL
shorturlX = re.compile(r'w/index\.php\?title=', flags=re.U)  # CONVERT TO SHORTURL


#file = urllib.urlopen('http://noc.wikimedia.org/~tstarling/ifexist.log')
# Open the log read-only: the old 'a+' mode would create the file when
# missing, risks accidental appends, and its initial read position is
# platform-dependent.
logFile = codecs.open('ifexists.log.1', 'r', encoding='utf-8')
dataFile = codecs.open(dataFileName, 'w', encoding='utf-8')

x = logFile.read()
logFile.close()  # done reading; release the handle immediately

# 'matches', not 'list': never shadow the builtin name.
matches = urlX.findall(x)

# Deduplicate - the same page is typically queried many times in the log.
s = set(matches)

a = raw_input('haha\n\n\n\n\n\a')

## TREATING THE URLS AND HOPEFULLY FURTHER REDUCING THE SET

s1=set([])
for i in s:
 i = andX.sub('',i) #REMOVE THE &... TAIL
 i = shorturlX.sub('wiki/',i) #REPLACE BY SHORTURL 
 #print i
 s1.add(i)

n=0
for i in s1:
  if a=='': a=raw_input(i+'\npress return to continue, something else to automate')
  n+=1
  dataFile.write(i+'\n')

print n,'urls in total.'
dataFile.close()


print 'dataFile IS ', dataFileName


### CRAP - UNLESS YOU HAVE VERY LITTLE MEMORY
"""

try:
  while True:
    x = file.read(100000)
    print x

    if a=='': a=raw_input('press Return to continue to wait, or press some other key to automatise')

    saveFile.write(x)
    
    currentTime=time.clock()
    while time.clock()< currentTime+10:
      print'..'

finally:
    saveFile.close()

"""
#