Hemanth.HM

A Computer Polyglot, CLI + WEB ♥'r.

Web Crawler With Python Twisted


Here is a simple HTTP crawler I wrote with Python Twisted:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Needs: PyOpenSSL and Twisted 12.3+
from twisted.web.client import getPage
from twisted.python import log
from twisted.internet import defer, task
from BeautifulSoup import BeautifulSoup
import re


def parallel(iterable, count, callable, *args, **named):
    """Run `callable` over `iterable` with at most `count` fetches in flight."""
    coop = task.Cooperator()
    work = (callable(elem, *args, **named) for elem in iterable)
    return defer.DeferredList([coop.coiterate(work) for _ in xrange(count)])


def union(p, q):
    """Append the links in p that are not already in q, printing each new one."""
    for e in p:
        if e not in q:
            print e
            q.append(e)


def extractLinks(html):
    """Return the absolute http:// hrefs found in the page."""
    soup = BeautifulSoup(html)
    return [str(anchor['href'])
            for anchor in soup.findAll('a', attrs={'href': re.compile("^http://")})
            if anchor['href']]


def crawlPage(url, urlList):
    """Fetch a page, extract its links and merge them into urlList."""
    d = getPage(url)
    d.addCallback(extractLinks)
    d.addCallback(union, urlList)
    d.addErrback(log.err)
    return d


def main(reactor, *args):
    urls = list(args)
    return parallel(urls, len(urls), crawlPage, urls)


if __name__ == '__main__':
    task.react(main, ["http://h3manth.com", "http://www.test.com"])  # Can pass a list of URLs
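
If you would rather pass the seed URLs on the command line than hard-code them, a minimal variant of the __main__ stanza (reusing task and main from the script above; the crawler.py file name is just an assumption) could look like this:

# Hypothetical variant: take seed URLs from the command line.
# Assumes the script above is saved as crawler.py.
if __name__ == '__main__':
    import sys
    seeds = sys.argv[1:] or ["http://h3manth.com"]  # fall back to a default seed
    task.react(main, seeds)

Invoked as python crawler.py http://h3manth.com http://example.com, it prints every newly discovered link exactly once while the fetches run concurrently.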

Here is the non-Twisted version:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""A linear (one page at a time) implementation of a simple HTTP crawler.

This crawler crawls from a given URL up to a specified limit,
or without bound if no limit is given.

TODO:
1. import robotparser and parse robots.txt
2. Write the URLs to a DB using sqlite.
3. Content type validation using response.info().headers
"""
import urllib2
import socket
from lxml.html import parse
import argparse
import sys
import re

socket.setdefaulttimeout(10)


class Spidy:
    """Main spider class, public method crawl."""

    def __init__(self, url):
        self.seed = url
        self.failed = []
        self.crawled = []

    def __union(self, p, q):
        """list(set(p) | set(q)), preserving the order of p."""
        for e in q:
            if e not in p:
                p.append(e)

    def __extractLinks(self, page):
        """Extract hrefs as absolute URLs."""
        dom = parse(page).getroot()
        dom.make_links_absolute()
        links = dom.cssselect('a')
        return [link.get('href') for link in links if link.get('href')]

    def crawl(self, limit=float('inf')):
        """Crawl the web page by page, with an optional limit."""
        tocrawl = [self.seed]
        while tocrawl and len(self.crawled) < limit:
            page = tocrawl.pop()
            print page  # Printing as of now, for redirection.
            if page not in self.crawled:
                try:
                    self.__union(tocrawl, self.__extractLinks(page))
                    self.crawled.append(page)
                except Exception as e:
                    print e
                    self.failed.append([page, e])  # Failed! Write to DB.
        return self.crawled


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Spidy, a simple web crawler')
    parser.add_argument('-u', '--url', help='URL to crawl', required=True)
    parser.add_argument('-l', '--limit', type=int, help='Crawling limit', required=False)
    args = parser.parse_args()
    url = args.url
    limit = args.limit
    if re.match("^https?://", url):
        try:
            urllib2.urlopen(url)
        except IOError:
            print "Not a real URL"
            sys.exit(0)
    else:
        print "Sorry, only http or https URLs are accepted as of now"
        sys.exit(0)
    if not url.endswith("/"):
        url += "/"  # Needs a trailing slash.
    spider = Spidy(url)
    spider.crawl() if limit is None else spider.crawl(limit)
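
Besides the command-line form (something like python spidy.py -u http://h3manth.com -l 10, where spidy.py is just an assumed file name), the class can also be driven from Python directly; a small usage sketch under that assumption:

# Hypothetical usage, assuming the Spidy class above lives in spidy.py
from spidy import Spidy

spider = Spidy("http://h3manth.com/")  # seed URL, with the trailing slash
pages = spider.crawl(limit=10)         # stop after 10 pages
print "crawled:", len(pages)
print "failed :", len(spider.failed)   # [page, exception] pairs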
