#!/usr/bin/env python
#
# gcget -- screen scrape Geocaching.com's annoying web interface
# aka SHOW ME THE CACHE!!!
#
# Copyright 2007, Evan Battaglia
# Distributed under the terms of the GPL v2.
#
#
# requires module mechanize
#

# DEFAULT USERNAME AND PASSWORD: THESE ARE OVERWRITTEN BY COMMAND-LINE OPTIONS

USER="username"
PASS="password"

# Command line:
#   gcget [-u username] [-p password] [-d] lat,lon maxnumgcs [maxdist] [threshold]
# threshold -- if the site reports more than this number of geocaches,
#              don't get ANY; instead give a warning and quit.
# -d        -- on a failed login, dump the last page to gcget.badlogin.html.
#
# NOTE: options are parsed with getopt.gnu_getopt, so a coordinate argument
# that starts with "-" (e.g. a negative latitude) is taken for an option
# cluster unless it is preceded by "--".

import sys
import getopt
import re

# Results are fetched one page at a time; the page arithmetic below
# ("rounds up to multiples of 20") uses this size.
PAGE_SIZE = 20


def help():
    """Write the usage text to stdout."""
    sys.stdout.write("""gcget v0.1
This program is free software, distributed under the terms of the GNU GPL v2.

Usage: gcget [-u username] [-p password] lat,lon maxnumberofgcs [maxdistance] [threshold]
Downloads up to maxnumberofgcs at a distance of up to maxdistance from lat,lon.
If we number of geocaches within maxdistance is above threshold, don't download any geocaches,
just give a warning and quit.

If username and password are not given, will use default values hard-coded in script.

Happy caching!!!

""")


def parse_args(argv):
    """Parse the command line (without the program name).

    Returns a dict with the keys:
      user, password -- login credentials (default: module USER / PASS)
      debug          -- True when -d was given
      lat, lon       -- coordinate strings, passed to the site unmodified
      maxgcs         -- requested number of geocaches (int)
      maxdist        -- maximum distance as a string (default "999")
      threshold      -- abort when more records than this exist (int)
      pages          -- maxgcs rounded up to whole result pages (int)

    Exits via sys.exit(): status 2 after printing usage for unknown options
    or malformed arguments; default status after printing usage for
    -h/--help or when fewer than two positional arguments are given.
    """
    try:
        # "h" is declared here so that the -h branch below is reachable.
        opts, args = getopt.gnu_getopt(argv, "u:p:dh", ["help"])
    except getopt.GetoptError:
        # print help information and exit:
        help()
        sys.exit(2)

    user = USER
    password = PASS
    debug = False
    for o, a in opts:
        if o == "-p":
            password = a
        elif o == "-u":
            user = a
        elif o in ("--help", "-h"):
            help()
            sys.exit()
        elif o == "-d":
            debug = True

    if len(args) < 2:
        help()
        sys.exit()

    try:
        # Unpacking raises ValueError when the comma is missing; extra
        # comma-separated fields are ignored, as before.
        lat, lon = args[0].split(",")[:2]
        maxgcs = int(args[1])
        if len(args) >= 4:
            threshold = int(args[3])
        else:
            threshold = 1000000
    except ValueError:
        help()
        sys.exit(2)

    if len(args) >= 3:
        maxdist = args[2]
    else:
        maxdist = "999"

    return {
        "user": user,
        "password": password,
        "debug": debug,
        "lat": lat,
        "lon": lon,
        "maxgcs": maxgcs,
        "maxdist": maxdist,
        "threshold": threshold,
        # rounds up to multiples of 20
        "pages": (maxgcs + PAGE_SIZE - 1) // PAGE_SIZE,
    }


def total_records(page):
    """Return the number after "Total Records: " in page, or None if absent."""
    # "+" rather than "*": an empty digit run must not reach int().
    m = re.search("Total Records: ([0-9]+)", page)
    if m:
        return int(m.group(1))
    return None


# get magic number for "Next" button.
# this is normally 16 (link hidden is $ctl16), unless there are less than
# 10 pages of results, in which case it will be less (e.g. 09 for 3 pages
# of results)
def getmagicnumber(b):
    """Return the highest NN in 16..1 for which "pgrBottom?ctlNN" occurs.

    b is a browser-like object; only b.response().get_data() is used.
    Returns None when no such control name is found in the current page.
    """
    page = b.response().get_data()  # fetch once instead of once per candidate
    for i in range(16, 0, -1):
        # The "." is an unescaped wildcard, kept exactly as in the original
        # pattern (presumably to match whatever separator the page uses).
        if re.search("pgrBottom.ctl%02d" % i, page):
            return i
    return None


def main(argv=None):
    """Log in, run the search, and write the downloaded pages to stdout.

    argv defaults to sys.argv[1:]. Progress and errors go to stderr.
    Exits with status 4 when the record count exceeds the threshold.
    """
    if argv is None:
        argv = sys.argv[1:]
    cfg = parse_args(argv)

    # Imported only after the arguments are validated, matching the original
    # ordering, so that usage errors and --help work without mechanize.
    from mechanize import Browser
    import ClientForm  # not referenced below; retained from the original imports

    b = Browser()
    b.open("http://geocaching.com/seek/")
    b.follow_link(text="Log in")
    b.select_form(nr=0)
    b["myUsername"] = cfg["user"]
    b["myPassword"] = cfg["password"]
    b.submit()

    magicnumber = 0  # the ctl number of Next. get only once

    # The search form is only selectable after a successful login.
    try:
        b.select_form("form4")
    except Exception:
        sys.stderr.write("Invalid username/password\n")
        if cfg["debug"]:
            f = open("gcget.badlogin.html", "w")
            try:
                f.write(b.response().get_data())
            finally:
                f.close()
            sys.stderr.write("Dumping last HTML page recieved into gcget.badlogin.html\n")
        sys.exit()

    b["origin_lat"] = cfg["lat"]
    b["origin_long"] = cfg["lon"]
    b["dist"] = cfg["maxdist"]
    b.submit()

    records = total_records(b.response().get_data())
    if records is None:
        sys.stdout.write("can't find total records\n")
        sys.exit(0)
    if records > cfg["threshold"]:
        sys.stderr.write("THRESHOLD %d > %d\n" % (records, cfg["threshold"]))
        sys.exit(4)
    sys.stderr.write("ok found %d, getting min(%d,%d) gcs\n"
                     % (records, records, cfg["maxgcs"]))

    pages = 0
    # (records+19)//20 is the max pages
    for ii in range(min(cfg["pages"], (records + PAGE_SIZE - 1) // PAGE_SIZE)):
        # Tick every checkbox on the page and submit to download the batch.
        # Best effort: any failure here ends the download loop.
        try:
            b.select_form(nr=0)
            b['CID'] = [item.name for item in b.find_control('CID').items]
            b.submit()
        except Exception:
            break

        lines = b.response().get_data().split("\n")
        # only print one header, start of xml file
        if ii == 0:
            sys.stdout.write("\n".join(lines[0:2]) + "\n")

        # core
        sys.stdout.write("\n".join(lines[2:-1]) + "\n")
        # NOTE(review): this literal is empty in the copy under review; if
        # upstream printed markup here it was lost -- confirm.
        sys.stdout.write("\n")

        pages += 1
        sys.stderr.write("i")
        sys.stderr.flush()

        # Return to the result list and move to the next page.
        b.back()
        if not magicnumber:
            magicnumber = getmagicnumber(b)
        if not magicnumber:
            # couldn't find magic number! why does this happen?
            break
        b.select_form(nr=0)
        list(b.forms())[0].new_control(
            "hidden", "ctl00$ContentBody$pgrBottom$ctl%02d" % magicnumber, {})
        b.submit()

    sys.stderr.write("\n")

    if pages:
        # NOTE(review): empty literal in the copy under review (possibly a
        # closing tag lost in extraction) -- confirm.
        sys.stdout.write("\n")


if __name__ == "__main__":
    main()