User talk:Tapin/CategorizationBot.rb
From Geohashing
< User talk:Tapin
Revision as of 04:57, 11 June 2008 by imported>Tapin (first pass, just identifies already-catted images)
#!/usr/bin/ruby
require 'uri'
require 'net/http'
require 'rexml/document'
# Builds request URLs for the geohashing wiki's MediaWiki API endpoint.
class XkcdUrl
  # Hostname of the wiki (default: 'wiki.xkcd.com').
  attr_accessor :host
  # Path to the API script (default: '/wgh/api.php').
  attr_accessor :path
  # Hash of query parameters (name => value), or nil for a bare URL.
  attr_accessor :queryParams

  def initialize(host = 'wiki.xkcd.com', path = '/wgh/api.php')
    @host = host
    @path = path
  end

  # Assemble a URI::HTTP from host, path and (optionally) queryParams.
  #
  # Fix: parameters are now percent-encoded via URI.encode_www_form
  # instead of raw "k=v" string joins, so values containing spaces,
  # colons, '&' or '=' can no longer produce a malformed query string.
  #
  # @return [URI::HTTP] the built URL
  def buildUrl
    if @queryParams.nil?
      URI::HTTP.build(:host => @host, :path => @path)
    else
      URI::HTTP.build(:host => @host, :path => @path,
                      :query => URI.encode_www_form(@queryParams))
    end
  end
end
# a) Class for category retrieval
# Represents a "Meetup on YYYY-MM-DD" category for a given date.
class MeetupOn
  # The meetup date as a "YYYY-MM-DD" string.
  attr_accessor :date

  # TODO: Need parent w/host,path etc
  #
  # Fix: implements the original TODO — the date now defaults to today
  # instead of the hard-coded "2008-06-10". Callers passing an explicit
  # date are unaffected.
  #
  # @param date [String] meetup date, "YYYY-MM-DD"
  def initialize(date = Time.now.strftime('%Y-%m-%d'))
    @date = date
  end

  # Placeholder: will eventually build the category URL for this date.
  def getUrl
  end
end
# b) Class for category member retrieval
# c) Class for images linked from (b)
# Fetch every "Meetup on ..." category name from the wiki API.
#
# Fix: implements the original TODO — results are now paged through 500
# at a time using the acfrom continuation value returned at xpath
# api/query-continue/allcategories (@acfrom attribute), instead of
# stopping after the first 500 categories.
#
# @return [Array<String>] category names; [] if any HTTP request fails
def getMeetupCategories
  cats = []
  acfrom = nil
  loop do
    wiki = XkcdUrl.new
    wiki.queryParams = {}
    wiki.queryParams["action"] = "query"
    wiki.queryParams["list"] = "allcategories"
    wiki.queryParams["acprefix"] = "Meetup_on"
    wiki.queryParams["prop"] = "info"
    wiki.queryParams["format"] = "xml"
    wiki.queryParams["aclimit"] = "500"
    wiki.queryParams["acfrom"] = acfrom unless acfrom.nil?

    r = Net::HTTP.get_response(wiki.buildUrl)
    return [] if r.code != "200"

    xml = REXML::Document.new r.body
    xml.elements.each("api/query/allcategories/c") { |elt| cats.push elt.get_text.to_s }

    # No query-continue element means we have the last page of results.
    cont = xml.elements["api/query-continue/allcategories"]
    break if cont.nil?
    acfrom = cont.attributes["acfrom"]
  end
  cats
end
# Return the titles of all main-namespace pages in the given category.
#
# @param category [String] category name without the "Category:" prefix;
#   spaces are converted to underscores for the API call
# @return [Array<String>] page titles; [] if the API call fails
def getPagesFromCategory(category)
  wiki = XkcdUrl.new
  params = {}
  params["action"]      = "query"
  params["list"]        = "categorymembers"
  params["cmtitle"]     = "Category:" + category.to_s.tr(' ', '_')
  params["cmlimit"]     = "500"
  params["format"]      = "xml"
  params["cmnamespace"] = 0 # Only get real pages -- 6 is 'Image:'
  wiki.queryParams = params

  response = Net::HTTP.get_response(wiki.buildUrl)
  return [] unless response.code == "200"

  titles = []
  doc = REXML::Document.new response.body
  doc.elements.each("api/query/categorymembers/cm") do |member|
    titles.push member.attributes['title'].to_s
  end
  titles
end
# Return the titles of all images embedded on the given meetup page.
#
# @param meetup [String] page title; spaces become underscores
# @return [Array<String>] image titles; [] if the API call fails
def getImagesFromMeetup(meetup)
  wiki = XkcdUrl.new
  wiki.queryParams = {
    "action" => "query",
    "titles" => meetup.to_s.tr(' ', '_'),
    "prop"   => "images",
    "format" => "xml"
  }

  response = Net::HTTP.get_response(wiki.buildUrl)
  return [] unless response.code == "200"

  found = []
  doc = REXML::Document.new response.body
  doc.elements.each("api/query/pages/page/images/im") do |im|
    found.push im.attributes['title'].to_s
  end
  found
end
# Return the categories the given image page currently belongs to.
#
# @param image [String] image page title; spaces become underscores
# @return [Array<String>] category titles (with "Category:" prefix,
#   as returned by the API); [] if the API call fails
def getCategoriesFromImage(image)
  wiki = XkcdUrl.new
  wiki.queryParams = {
    "action" => "query",
    "titles" => image.to_s.tr(' ', '_'),
    "prop"   => "categories",
    "format" => "xml"
  }

  response = Net::HTTP.get_response(wiki.buildUrl)
  return [] unless response.code == "200"

  categories = []
  doc = REXML::Document.new response.body
  doc.elements.each("api/query/pages/page/categories/cl") do |cl|
    categories.push cl.attributes['title'].to_s
  end
  categories
end
# Walk every meetup category -> its member pages -> each page's images,
# printing the tree and reporting any image that already carries its
# meetup's category. Sleeps 1s between API calls to be polite to the wiki.
def main
  categories = getMeetupCategories
  sleep 1
  categories.each do |cat|
    puts "Category:#{cat}"
    member_pages = getPagesFromCategory(cat)
    sleep 1
    member_pages.each do |page|
      puts " #{page}"
      page_images = getImagesFromMeetup(page)
      sleep 1
      page_images.each do |image|
        puts " #{image}"
        image_categories = getCategoriesFromImage(image)
        sleep 1
        if image_categories.include?("Category:#{cat}")
          puts "Found an image that _is_ categorized: #{image} in #{cat}"
        end
      end
    end
  end
end

main