#!/usr/bin/env python
"""
Author:  Matt Weber
Date:    3/17/2007

Submits a search string to Google using a random
User-Agent to find WordPress blog comment pages.
Defaults to finding 100 links; a larger maximum can be
given on the command line, pausing between page requests.
"""

import sys
import urllib2
import re, time, random

# list of user agents to choose from at random for each request
USER_AGENTS = ['Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en) AppleWebKit/418.9.1 (KHTML, like Gecko) Safari/419.3',
               'Mozilla/4.0 (compatible; MSIE 6.0; Windows XP)',
               'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2) Gecko/20070219 Firefox/2.0.0.2',
               'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
               'Mozilla/4.0 (compatible; MSIE 6.1; Windows XP)',
               'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.2) Gecko/20070219 Firefox/2.0.0.2',
               'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20060601 Firefox/2.0.0.2 (Ubuntu-edgy)',
               'Opera/9.01 (Windows NT 5.1; U; cs)']

# the header to use when connecting to the page; 'User-Agent' is
# filled in with a random agent before every request
BASE_HEADERS = { 'User-Agent': '',
                 'Accept-Language': 'en-us',
                 'Keep-Alive': '300',
                 'Connection': 'keep-alive',
                 'Cache-Control': 'max-age=0' }

# the search url (asks for 100 results per page via num=100)
SEARCH_URL = 'http://www.google.com/search?q=%22Leave+a+Reply%22+Name+Mail+Website+%22proudly+powered+by+WordPress%22&num=100&hl=en&safe=off&filter=0'

# result index to stop at when no limit is given on the command line
DEFAULT_MAX_RESULTS = 100
# how far the start index advances per page; matches num=100 in SEARCH_URL
RESULTS_PER_PAGE = 100
# seconds to wait before retrying a failed request
RETRY_DELAY_SECONDS = 30
# seconds to wait between successive result pages
PAGE_DELAY_SECONDS = 10

# quoted http:// links; compiled once instead of on every loop pass
PAGE_URL_RE = re.compile(r'''["']http://[^+]*?['"]''')
# "<n></b> of <b><m>" marker used to detect the last result page.
# NOTE(review): '[,\d{3},*]' is a character class, not a group, so it is
# looser than it looks; the pattern is kept as-is to preserve matching.
FINAL_PAGE_RE = re.compile(r'''\d{1,3}[,\d{3},*]*</b> of <b>\d{1,3}[,\d{3},*]*''')


def parse_max_results(argv):
  """Return the maximum start index requested on the command line.

  argv is a sys.argv-style list. Falls back to DEFAULT_MAX_RESULTS when
  the argument is missing or is not an integer.
  """
  try:
    return int(argv[1])
  except (IndexError, ValueError):
    return DEFAULT_MAX_RESULTS


def build_page_url(base_url, start):
  """Return base_url with the paging parameters for index start appended."""
  # built fresh from the base url each time so that the paging
  # parameters do not pile up across pages and retries
  return base_url + ('&start=%d&sa=N' % start)


def extract_urls(html):
  """Return the quoted http:// links found in html, minus the first match."""
  # the first match is skipped and the surrounding quotes are stripped
  return [quoted[1:-1] for quoted in PAGE_URL_RE.findall(html)[1:]]


def is_last_page(html):
  """Return True if html contains the final-page marker."""
  return FINAL_PAGE_RE.search(html) is not None


def fetch_page(url, headers):
  """Download url with the given request headers and return the body."""
  request = urllib2.Request(url, None, headers)
  response = urllib2.urlopen(request)
  try:
    return response.read()
  finally:
    # always release the connection, even if the read fails
    response.close()


def main(argv=None):
  """Collect links from the search result pages and print one per line.

  argv defaults to sys.argv; argv[1], when present, is the start index
  at which paging stops.
  """
  if argv is None:
    argv = sys.argv

  # check if passed max from command line
  max_results = parse_max_results(argv)

  # url list and the start index
  page_urls = []
  start = 0
  headers = dict(BASE_HEADERS)

  # loop until the last page or the requested maximum is reached
  while True:
    # assign a random user agent to trick google
    headers['User-Agent'] = random.choice(USER_AGENTS)
    try:
      # try to get the html
      html = fetch_page(build_page_url(SEARCH_URL, start), headers)
    except Exception:
      # best-effort retry; Exception (not a bare except) so that
      # Ctrl-C and SystemExit still stop the script
      sys.stderr.write('request failed (%s), retrying in %d seconds\n'
                       % (sys.exc_info()[1], RETRY_DELAY_SECONDS))
      time.sleep(RETRY_DELAY_SECONDS)
      continue

    # gather all the links from the page
    page_urls.extend(extract_urls(html))

    # check if we are on last page or have hit our max count yet
    # exit loop if we have; >= so limits that are not a multiple
    # of the page size still terminate
    start = start + RESULTS_PER_PAGE
    if is_last_page(html) or start >= max_results:
      break

    # pause before we get the next page
    # google blocks you if you do too many searches
    # in a specific amount of time
    time.sleep(PAGE_DELAY_SECONDS)

  # list all the urls
  for page in page_urls:
    print(page)


if __name__ == '__main__':
  main()
