07.17 Многопоточный чекер проиндексированных страниц Яндекс, Google
Поддерживаемые выдачи: Яндекс, Гугл, Гугл за 24 часа, Гугл за неделю, Гугл за месяц.
Яндекс можно чекать без прокси, примерная скорость без прокси при 100 потоках 10 ссылок в секунду.
# coding:utf-8 # author: Rushter # site: http://klipner.ru from grab import Grab import re,time,random import threading,Queue ################################# useProxy = False yandexq = True googlea = False googled = False googlew = False googlem = False googleMain = False threads = 100 proxyFile = 'proxy.txt' proxyType = 'http' proxyCheckUrl = 'http://google.com' urls = 'urls.txt' result = 'result.txt' separator = '|' timeout = 150 try_count = 3 ################################## queue = Queue.Queue() lock = threading.RLock() proxy = open(proxyFile).readlines() urlsList = open(urls).readlines() types = ['','&tbs=qdr:d','&tbs=qdr:w','&tbs=qdr:m'] t1 = time.time() def getProxy(): if useProxy: proxyType=None g = Grab() for _ in xrange(try_count): proxyq = random.choice(proxy).strip() g.setup(proxy=proxyq, proxy_type=proxyType, url=proxyCheckUrl) try: g.request() return proxyq except: pass return None else: return None def write(data): lock.acquire() d = open(result,'a+') d.write(data+'\n') d.close() lock.release() def worker(): global queue while True: try: target = queue.get_nowait() except Queue.Empty: return check(target) def check(site): result = site+'|' if googlea: result += google(site,0)+separator if googled: result += google(site,1)+separator if googlew: result += google(site,2)+separator if googlem: result += google(site,3)+separator if yandexq: result += yandex(site) if googleMain: if googlea: result += google(site,0,True)+separator if googled: result += google(site,1,True)+separator if googlew: result += google(site,2,True)+separator if googlem: result += google(site,3,True)+separator write(result) def google(site,type,main=False): g = Grab() if useProxy: g.setup(proxy=getProxy(),proxy_type=proxyType) if main: site = site+'/&' url='http://www.google.com/search?hl=en&q=site:%s&aq=f&aqi=&aql=&oq=&gs_rfai='%(site)+types[type] for _ in xrange(try_count): g.setup(url=url,timeout=timeout) g.request() data = g.response_body if data.find('did not match any')>0: 
count=list('0') else: count = re.findall('resultStats.*?([0-9,]{1,10}).*?nobr',data) if len(count)!=0: count = count[0].replace(',','') return count return 'err' def yandex(site): g = Grab() if useProxy: g.setup(proxy=getProxy(),proxy_type=proxyType) url = 'http://search.qip.ru/search?query=site:%s&lr=1'%(site) for _ in xrange(try_count): g.setup(url=url,timeout=timeout) g.request() data = g.response_body count = re.findall('найден.{0,2}<br /> (.*?) страниц',data) if len(count)!=0: count = count[0].replace('миллион','000000').replace('тысяч','000') return count return 'err' def main(): d = open(result,'w') d.write('Site'+separator) if googlea: d.write('Google '+separator) if googled: d.write('Google day '+separator) if googlew: d.write('Google week '+separator) if googlem: d.write('Google month '+separator) if yandexq: d.write('Yandex '+separator) if googleMain: if googlea: d.write('MGoogle '+separator) if googled: d.write('MGoogle day '+separator) if googlew: d.write('MGoogle week '+separator) if googlem: d.write('MGoogle month '+separator) d.write('\n') d.close() print "Check started" for u in urlsList: queue.put(u.strip()) for _ in xrange(threads): thread_ = threading.Thread(target=worker) thread_.start() while threading.active_count() >1: time.sleep(1) print queue.qsize() print "Check ended" print str(time.time()-t1) main()
-
Евгений
-
http://klipner.ru rushter