User talk:Tapin/CategorizationBot.rb
From Geohashing
< User talk:Tapin
Revision as of 04:59, 11 June 2008 by Tapin (Try again...)
#!/usr/bin/ruby require 'uri' require 'net/http' require 'rexml/document' class XkcdUrl attr_accessor :host attr_accessor :path attr_accessor :queryParams def initialize(host = 'wiki.xkcd.com', path = '/wgh/api.php') @host = host @path = path end def buildUrl if !@queryParams.nil? queryAry = [] queryParams.each_pair{|k, v| queryAry.push "#{k}=#{v}"} url = URI::HTTP.build({:host => @host, :path => @path, :query => queryAry.join('&')}) else url = URI::HTTP.build({:host => @host, :path => @path}) end return url end end # a) Class for category retrieval class MeetupOn attr_accessor :date #TODO: Need parent w/host,path etc def initialize(date = "2008-06-10") #TODO: Change this to default to today @date = date end def getUrl end end # b) Class for category member retrieval # c) Class for images linked from (b) def getMeetupCategories wiki = XkcdUrl.new wiki.queryParams = {} wiki.queryParams["action"] = "query" wiki.queryParams["list"] = "allcategories" wiki.queryParams["acprefix"] = "Meetup_on" wiki.queryParams["prop"] = "info" wiki.queryParams["format"] = "xml" wiki.queryParams["aclimit"] = "500" #TODO: This will only retrieve the first 500. 
We need to be able to page through as # necessary, using the acfrom parameter # (taken from prev result xpath api/query-continue/allcategories, I think) url = wiki.buildUrl r = Net::HTTP.get_response(url) if (r.code != "200") return [] end cats = [] xml = REXML::Document.new r.body xml.elements.each("api/query/allcategories/c") { |elt| cats.push elt.get_text.to_s } return cats end def getPagesFromCategory(category) wiki = XkcdUrl.new wiki.queryParams = {} wiki.queryParams["action"] = "query" wiki.queryParams["list"] = "categorymembers" wiki.queryParams["cmtitle"] = "Category:" + category.to_s.tr(' ', '_') wiki.queryParams["cmlimit"] = "500" wiki.queryParams["format"] = "xml" wiki.queryParams["cmnamespace"] = 0 # Only get real pages -- 6 is 'Image:' url = wiki.buildUrl r = Net::HTTP.get_response(url) if (r.code != "200") return [] end pages = [] xml = REXML::Document.new r.body xml.elements.each("api/query/categorymembers/cm") { |elt| pages.push elt.attributes['title'].to_s } return pages end def getImagesFromMeetup(meetup) wiki = XkcdUrl.new wiki.queryParams = {} wiki.queryParams["action"] = "query" wiki.queryParams["titles"] = meetup.to_s.tr(' ', '_') wiki.queryParams["prop"] = "images" wiki.queryParams["format"] = "xml" url = wiki.buildUrl r = Net::HTTP.get_response(url) if (r.code != "200") return [] end images = [] xml = REXML::Document.new r.body xml.elements.each("api/query/pages/page/images/im") { |elt| images.push elt.attributes['title'].to_s } return images end def getCategoriesFromImage(image) wiki = XkcdUrl.new wiki.queryParams = {} wiki.queryParams["action"] = "query" wiki.queryParams["titles"] = image.to_s.tr(' ', '_') wiki.queryParams["prop"] = "categories" wiki.queryParams["format"] = "xml" url = wiki.buildUrl r = Net::HTTP.get_response(url) if (r.code != "200") return [] end cats = [] xml = REXML::Document.new r.body xml.elements.each("api/query/pages/page/categories/cl") { |elt| cats.push elt.attributes['title'].to_s } return cats end def main cats 
= getMeetupCategories sleep 1 cats.each do |cat| puts "Category:#{cat}" pages = getPagesFromCategory(cat) sleep 1 pages.each do |page| puts " #{page}" images = getImagesFromMeetup(page) sleep 1 images.each do |image| puts " #{image}" imageCats = getCategoriesFromImage(image) sleep 1 if imageCats.member? "Category:#{cat}" puts "Found an image that _is_ categorized: #{image} in #{cat}" end end end end end main