Difference between revisions of "User talk:Tapin/CategorizationBot.rb"

From Geohashing
imported>Tapin
(first pass, just identifies already-catted images)
 
imported>Tapin
(Whoops. Not what I meant.)
Line 1: Line 1:
#!/usr/bin/ruby
+
{{delete}}
 
 
require 'uri'
 
require 'net/http'
 
require 'rexml/document'
 
 
 
# Builds HTTP URLs for a MediaWiki-style API endpoint.
#
# host        - host name of the wiki (default 'wiki.xkcd.com')
# path        - path of the API script (default '/wgh/api.php')
# queryParams - optional Hash of query-string parameters; nil until assigned
class XkcdUrl
  attr_accessor :host, :path, :queryParams

  def initialize(host = 'wiki.xkcd.com', path = '/wgh/api.php')
    @host = host
    @path = path
    # Set explicitly so reading it before assignment is well defined.
    @queryParams = nil
  end

  # Assembles the URL from host, path and queryParams.
  #
  # Keys and values are form-encoded, so characters such as '&', '=', '#'
  # or spaces inside a value cannot break or truncate the query string.
  # A nil or empty queryParams yields a URL without a query part.
  #
  # Returns a URI::HTTP instance.
  def buildUrl
    parts = { :host => @host, :path => @path }

    unless @queryParams.nil? || @queryParams.empty?
      # Values are stringified first so nil still renders as "key=" and
      # non-String values (e.g. Integers) are accepted.
      pairs = @queryParams.map { |key, value| [key, value.to_s] }
      parts[:query] = URI.encode_www_form(pairs)
    end

    URI::HTTP.build(parts)
  end
end
 
 
 
# a) Class for category retrieval
 
# Placeholder for a single "Meetup on <date>" category; it only stores the
# date so far.
class MeetupOn
  #TODO: Need parent w/host,path etc
  attr_accessor :date

  # date - the meetup date, stored as given (default "2008-06-10").
  def initialize(date = "2008-06-10") #TODO: Change this to default to today
    self.date = date
  end

  # Not implemented yet; currently returns nil.
  def getUrl; end
end
 
 
 
# b) Class for category member retrieval
 
# c) Class for images linked from (b)
 
 
 
# Asks the wiki API for the categories whose names start with "Meetup_on".
#
# Returns an Array with the text of every <c> element in the reply, or an
# empty Array when the HTTP status is anything other than "200".
def getMeetupCategories
  api = XkcdUrl.new
  api.queryParams = {
    "action" => "query",
    "list" => "allcategories",
    "acprefix" => "Meetup_on",
    "prop" => "info",
    "format" => "xml",
    "aclimit" => "500"
  }

  #TODO: This will only retrieve the first 500.  We need to be able to page through as
  # necessary, using the acfrom parameter
  # (taken from prev result xpath api/query-continue/allcategories, I think)
  response = Net::HTTP.get_response(api.buildUrl)
  return [] unless response.code == "200"

  doc = REXML::Document.new(response.body)
  doc.elements.to_a("api/query/allcategories/c").map { |node| node.get_text.to_s }
end
 
 
 
# Lists the main-namespace pages that belong to a category.
#
# category - category name without the "Category:" prefix; spaces are
#            turned into underscores before the request is made.
#
# Returns an Array of page titles, or an empty Array when the HTTP status
# is anything other than "200".
def getPagesFromCategory(category)
  api = XkcdUrl.new
  api.queryParams = {
    "action" => "query",
    "list" => "categorymembers",
    "cmtitle" => "Category:#{category.to_s.tr(' ', '_')}",
    "cmlimit" => "500",
    "format" => "xml",
    "cmnamespace" => 0 # Only get real pages -- 6 is 'Image:'
  }

  response = Net::HTTP.get_response(api.buildUrl)
  return [] unless response.code == "200"

  doc = REXML::Document.new(response.body)
  doc.elements.to_a("api/query/categorymembers/cm").map { |node| node.attributes['title'].to_s }
end
 
 
 
# Lists the images used on a meetup page.
#
# meetup - page title; spaces are turned into underscores before the
#          request is made.
#
# Returns an Array of image titles, or an empty Array when the HTTP status
# is anything other than "200".
def getImagesFromMeetup(meetup)
  api = XkcdUrl.new
  api.queryParams = {
    "action" => "query",
    "titles" => meetup.to_s.tr(' ', '_'),
    "prop" => "images",
    "format" => "xml"
  }

  response = Net::HTTP.get_response(api.buildUrl)
  return [] unless response.code == "200"

  doc = REXML::Document.new(response.body)
  doc.elements.to_a("api/query/pages/page/images/im").map { |node| node.attributes['title'].to_s }
end
 
 
 
# Lists the categories an image page belongs to.
#
# image - image page title; spaces are turned into underscores before the
#         request is made.
#
# Returns an Array of category titles as reported by the API, or an empty
# Array when the HTTP status is anything other than "200".
def getCategoriesFromImage(image)
  api = XkcdUrl.new
  api.queryParams = {
    "action" => "query",
    "titles" => image.to_s.tr(' ', '_'),
    "prop" => "categories",
    "format" => "xml"
  }

  response = Net::HTTP.get_response(api.buildUrl)
  return [] unless response.code == "200"

  doc = REXML::Document.new(response.body)
  doc.elements.to_a("api/query/pages/page/categories/cl").map { |node| node.attributes['title'].to_s }
end
 
 
 
# Walks every meetup category, each page in it and each image on that page,
# printing the hierarchy as it goes and reporting images that already carry
# the meetup's category. Sleeps one second after every API request.
def main
  meetup_categories = getMeetupCategories
  sleep 1

  meetup_categories.each do |category_name|
    # The heading printed here is the same string the membership check uses.
    full_category = "Category:#{category_name}"
    puts full_category

    member_pages = getPagesFromCategory(category_name)
    sleep 1

    member_pages.each do |page_title|
      puts " #{page_title}"

      page_images = getImagesFromMeetup(page_title)
      sleep 1

      page_images.each do |image_title|
        puts "  #{image_title}"

        image_categories = getCategoriesFromImage(image_title)
        sleep 1

        next unless image_categories.include?(full_category)

        puts "Found an image that _is_ categorized: #{image_title} in #{category_name}"
      end
    end
  end
end
 
 
 
# Invoke main as soon as this file is loaded.
main
 

Revision as of 04:58, 11 June 2008

This discussion page has been tagged for deletion. If deleted, only the discussion page will disappear, not the actual page being discussed. If you disagree with its deletion, remove the {{delete}} tag from the page. Otherwise, an administrator should come by and delete it shortly.

You may provide a reason for deletion by using {{delete|reason}}. See the list of pages tagged for deletion.