User talk:Tapin/CategorizationBot.rb

From Geohashing
< User talk:Tapin
Revision as of 04:57, 11 June 2008 by imported>Tapin (first pass, just identifies already-catted images)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
#!/usr/bin/ruby

require 'uri'
require 'net/http'
require 'rexml/document'

# Builds URLs for the wiki.xkcd.com MediaWiki API.
class XkcdUrl

 attr_accessor :host
 attr_accessor :path
 attr_accessor :queryParams
 
 # host/path default to the geohashing wiki's api.php endpoint.
 def initialize(host = 'wiki.xkcd.com', path = '/wgh/api.php')
   @host = host
   @path = path
 end

 # Returns a URI::HTTP for host + path, with queryParams (if set)
 # percent-encoded via URI.encode_www_form.  The original joined raw
 # "k=v" pairs, which could raise URI::InvalidComponentError or
 # silently corrupt the query when a value contained spaces, '&', etc.
 def buildUrl
   if @queryParams.nil?
     URI::HTTP.build(:host => @host, :path => @path)
   else
     URI::HTTP.build(:host => @host, :path => @path,
                     :query => URI.encode_www_form(@queryParams))
   end
 end

end

# a) Class for category retrieval

# Represents a "Meetup on YYYY-MM-DD" category for a given date.
class MeetupOn

 attr_accessor :date
 #TODO: Need parent w/host,path etc
 
 # date defaults to today in the wiki's YYYY-MM-DD format (resolves
 # the original TODO, which hard-coded "2008-06-10").  Callers that
 # pass an explicit date are unaffected.
 def initialize(date = Time.now.strftime('%Y-%m-%d'))
   @date = date
 end

 # Placeholder: will eventually build the category URL for @date.
 def getUrl
   
 end

end

# b) Class for category member retrieval
# c) Class for images linked from (b)

# Fetches every "Meetup on ..." category name from the wiki API.
# Follows the acfrom continuation token (api/query-continue/allcategories)
# so that more than 500 categories are retrieved -- the original stopped
# after the first page, as its TODO noted.
# Returns an array of category name strings, or [] on any HTTP error.
def getMeetupCategories

 cats = []
 acfrom = nil
 loop do
   wiki = XkcdUrl.new
   wiki.queryParams = {}
   wiki.queryParams["action"] = "query"
   wiki.queryParams["list"] = "allcategories"
   wiki.queryParams["acprefix"] = "Meetup_on"
   wiki.queryParams["prop"] = "info"
   wiki.queryParams["format"] = "xml"
   wiki.queryParams["aclimit"] = "500"
   wiki.queryParams["acfrom"] = acfrom unless acfrom.nil?
   url = wiki.buildUrl
   r = Net::HTTP.get_response(url)
   if (r.code != "200")
     return []
   end
   xml = REXML::Document.new r.body
   xml.elements.each("api/query/allcategories/c") { |elt| cats.push elt.get_text.to_s }
   # query-continue carries the next page's start token; absent on the last page
   cont = REXML::XPath.first(xml, "api/query-continue/allcategories")
   break if cont.nil?
   acfrom = cont.attributes["acfrom"]
 end
 return cats

end

# Returns the titles of all main-namespace pages in the given category
# (up to the API's 500-result page limit), or [] on an HTTP error.
def getPagesFromCategory(category)

 wiki = XkcdUrl.new
 wiki.queryParams = {
   "action"      => "query",
   "list"        => "categorymembers",
   "cmtitle"     => "Category:" + category.to_s.tr(' ', '_'),
   "cmlimit"     => "500",
   "format"      => "xml",
   "cmnamespace" => 0 # Only get real pages -- 6 is 'Image:'
 }

 response = Net::HTTP.get_response(wiki.buildUrl)
 return [] unless response.code == "200"

 doc = REXML::Document.new response.body
 titles = []
 doc.elements.each("api/query/categorymembers/cm") do |member|
   titles << member.attributes['title'].to_s
 end
 titles

end

# Returns the titles of all images embedded in the given meetup page,
# or [] on an HTTP error.
def getImagesFromMeetup(meetup)

 wiki = XkcdUrl.new
 wiki.queryParams = {
   "action" => "query",
   "titles" => meetup.to_s.tr(' ', '_'),
   "prop"   => "images",
   "format" => "xml"
 }

 response = Net::HTTP.get_response(wiki.buildUrl)
 return [] unless response.code == "200"

 doc = REXML::Document.new response.body
 titles = []
 doc.elements.each("api/query/pages/page/images/im") do |img|
   titles << img.attributes['title'].to_s
 end
 titles

end

# Returns the titles of all categories the given image page belongs to,
# or [] on an HTTP error.
def getCategoriesFromImage(image)

 wiki = XkcdUrl.new
 wiki.queryParams = {
   "action" => "query",
   "titles" => image.to_s.tr(' ', '_'),
   "prop"   => "categories",
   "format" => "xml"
 }

 response = Net::HTTP.get_response(wiki.buildUrl)
 return [] unless response.code == "200"

 doc = REXML::Document.new response.body
 titles = []
 doc.elements.each("api/query/pages/page/categories/cl") do |cl|
   titles << cl.attributes['title'].to_s
 end
 titles

end

# Walks every "Meetup on" category -> its pages -> each page's images,
# printing the tree as it goes and flagging any image that already
# carries the meetup's category.  Sleeps 1s after each API call to be
# polite to the wiki.
def main

 categories = getMeetupCategories
 sleep 1
 categories.each do |category|
   puts "Category:#{category}"
   meetup_pages = getPagesFromCategory(category)
   sleep 1
   meetup_pages.each do |meetup|
     puts " #{meetup}"
     image_titles = getImagesFromMeetup(meetup)
     sleep 1
     image_titles.each do |image|
       puts "  #{image}"
       image_categories = getCategoriesFromImage(image)
       sleep 1
       if image_categories.include? "Category:#{category}"
         puts "Found an image that _is_ categorized: #{image} in #{category}"
       end
     end
   end
 end

end

# Script entry point: kick off the categorization scan.
main