Difference between revisions of "User talk:Tapin/CategorizationBot.rb"
From Geohashing
imported>Tapin (Whoops. Not what I meant.) |
imported>Tapin (Try again...) |
||
Line 1: | Line 1: | ||
− | {{ | + | <pre> |
+ | #!/usr/bin/ruby | ||
+ | |||
+ | require 'uri' | ||
+ | require 'net/http' | ||
+ | require 'rexml/document' | ||
+ | |||
# Small helper that knows how to build request URLs against the
# geohashing wiki's MediaWiki API endpoint.
class XkcdUrl
  attr_accessor :host
  attr_accessor :path
  attr_accessor :queryParams

  # host/path default to the wiki's api.php endpoint.
  def initialize(host = 'wiki.xkcd.com', path = '/wgh/api.php')
    @host = host
    @path = path
  end

  # Build the request URI from @host, @path and (optionally) @queryParams.
  # Returns a URI::HTTP object.
  #
  # Bug fix: keys and values are now percent-encoded via
  # URI.encode_www_form. The original joined raw "k=v" strings, so a
  # title containing '&', '=' or a space would corrupt the query string
  # (or make URI::HTTP.build raise on the invalid component).
  def buildUrl
    if @queryParams.nil?
      URI::HTTP.build(:host => @host, :path => @path)
    else
      URI::HTTP.build(:host => @host, :path => @path,
                      :query => URI.encode_www_form(@queryParams))
    end
  end
end
+ | |||
# a) Class for category retrieval
class MeetupOn
  attr_accessor :date
  # TODO: Need parent w/host,path etc

  # TODO: Change this to default to today
  def initialize(date = "2008-06-10")
    @date = date
  end

  # Placeholder: URL construction for a dated meetup is not written yet,
  # so this currently returns nil.
  def getUrl
  end
end
+ | |||
+ | # b) Class for category member retrieval | ||
+ | # c) Class for images linked from (b) | ||
+ | |||
# Fetch the names of every "Meetup on ..." category from the wiki API.
#
# Implements the TODO in the original: results are paged 500 at a time
# using the acfrom continuation token the API returns under
# api/query-continue/allcategories, so wikis with more than 500 meetup
# categories are fully enumerated.
#
# Returns an Array of category name strings, or [] on any non-200
# HTTP response (matching the original's silent-empty error behavior).
def getMeetupCategories
  cats = []
  acfrom = nil

  loop do
    wiki = XkcdUrl.new
    wiki.queryParams = {}
    wiki.queryParams["action"] = "query"
    wiki.queryParams["list"] = "allcategories"
    wiki.queryParams["acprefix"] = "Meetup_on"
    wiki.queryParams["prop"] = "info"
    wiki.queryParams["format"] = "xml"
    wiki.queryParams["aclimit"] = "500"
    # Underscores match the file's existing space-handling convention.
    wiki.queryParams["acfrom"] = acfrom.tr(' ', '_') unless acfrom.nil?

    r = Net::HTTP.get_response(wiki.buildUrl)
    return [] if r.code != "200"

    xml = REXML::Document.new r.body
    xml.elements.each("api/query/allcategories/c") { |elt| cats.push elt.get_text.to_s }

    # Continuation marker; absent once the last page has been fetched.
    cont = xml.elements["api/query-continue/allcategories"]
    break if cont.nil?
    acfrom = cont.attributes["acfrom"]
    break if acfrom.nil?
  end

  return cats
end
+ | |||
# Return the titles of all main-namespace (ns 0) pages that belong to
# the given category. Spaces in the category name are replaced with
# underscores, MediaWiki style. Returns [] on any non-200 response.
def getPagesFromCategory(category)
  wiki = XkcdUrl.new
  wiki.queryParams = {
    "action"      => "query",
    "list"        => "categorymembers",
    "cmtitle"     => "Category:" + category.to_s.tr(' ', '_'),
    "cmlimit"     => "500",
    "format"      => "xml",
    "cmnamespace" => 0 # Only get real pages -- 6 is 'Image:'
  }

  response = Net::HTTP.get_response(wiki.buildUrl)
  return [] unless response.code == "200"

  titles = []
  doc = REXML::Document.new response.body
  doc.elements.each("api/query/categorymembers/cm") do |member|
    titles.push member.attributes['title'].to_s
  end
  titles
end
+ | |||
# Return the titles of all images embedded in the given meetup page
# (prop=images). Spaces in the page title become underscores.
# Returns [] on any non-200 response.
def getImagesFromMeetup(meetup)
  wiki = XkcdUrl.new
  wiki.queryParams = {
    "action" => "query",
    "titles" => meetup.to_s.tr(' ', '_'),
    "prop"   => "images",
    "format" => "xml"
  }

  response = Net::HTTP.get_response(wiki.buildUrl)
  return [] unless response.code == "200"

  found = []
  doc = REXML::Document.new response.body
  doc.elements.each("api/query/pages/page/images/im") do |img|
    found.push img.attributes['title'].to_s
  end
  found
end
+ | |||
# Return the titles of all categories the given image page belongs to
# (prop=categories). Spaces in the title become underscores.
# Returns [] on any non-200 response.
def getCategoriesFromImage(image)
  wiki = XkcdUrl.new
  wiki.queryParams = {
    "action" => "query",
    "titles" => image.to_s.tr(' ', '_'),
    "prop"   => "categories",
    "format" => "xml"
  }

  response = Net::HTTP.get_response(wiki.buildUrl)
  return [] unless response.code == "200"

  found = []
  doc = REXML::Document.new response.body
  doc.elements.each("api/query/pages/page/categories/cl") do |cl|
    found.push cl.attributes['title'].to_s
  end
  found
end
+ | |||
# Walk every "Meetup on ..." category, every page inside it, and every
# image on each page, printing what is found; flag any image that is
# already tagged with its meetup's category. A 1-second sleep follows
# each API call to avoid hammering the wiki.
def main
  categories = getMeetupCategories
  sleep 1
  categories.each do |category|
    puts "Category:#{category}"
    pages = getPagesFromCategory(category)
    sleep 1
    pages.each do |page|
      puts " #{page}"
      pics = getImagesFromMeetup(page)
      sleep 1
      pics.each do |pic|
        puts " #{pic}"
        picCats = getCategoriesFromImage(pic)
        sleep 1
        if picCats.include? "Category:#{category}"
          puts "Found an image that _is_ categorized: #{pic} in #{category}"
        end
      end
    end
  end
end

main
+ | </pre> |
Revision as of 04:59, 11 June 2008
#!/usr/bin/ruby require 'uri' require 'net/http' require 'rexml/document' class XkcdUrl attr_accessor :host attr_accessor :path attr_accessor :queryParams def initialize(host = 'wiki.xkcd.com', path = '/wgh/api.php') @host = host @path = path end def buildUrl if !@queryParams.nil? queryAry = [] queryParams.each_pair{|k, v| queryAry.push "#{k}=#{v}"} url = URI::HTTP.build({:host => @host, :path => @path, :query => queryAry.join('&')}) else url = URI::HTTP.build({:host => @host, :path => @path}) end return url end end # a) Class for category retrieval class MeetupOn attr_accessor :date #TODO: Need parent w/host,path etc def initialize(date = "2008-06-10") #TODO: Change this to default to today @date = date end def getUrl end end # b) Class for category member retrieval # c) Class for images linked from (b) def getMeetupCategories wiki = XkcdUrl.new wiki.queryParams = {} wiki.queryParams["action"] = "query" wiki.queryParams["list"] = "allcategories" wiki.queryParams["acprefix"] = "Meetup_on" wiki.queryParams["prop"] = "info" wiki.queryParams["format"] = "xml" wiki.queryParams["aclimit"] = "500" #TODO: This will only retrieve the first 500. 
We need to be able to page through as # necessary, using the acfrom parameter # (taken from prev result xpath api/query-continue/allcategories, I think) url = wiki.buildUrl r = Net::HTTP.get_response(url) if (r.code != "200") return [] end cats = [] xml = REXML::Document.new r.body xml.elements.each("api/query/allcategories/c") { |elt| cats.push elt.get_text.to_s } return cats end def getPagesFromCategory(category) wiki = XkcdUrl.new wiki.queryParams = {} wiki.queryParams["action"] = "query" wiki.queryParams["list"] = "categorymembers" wiki.queryParams["cmtitle"] = "Category:" + category.to_s.tr(' ', '_') wiki.queryParams["cmlimit"] = "500" wiki.queryParams["format"] = "xml" wiki.queryParams["cmnamespace"] = 0 # Only get real pages -- 6 is 'Image:' url = wiki.buildUrl r = Net::HTTP.get_response(url) if (r.code != "200") return [] end pages = [] xml = REXML::Document.new r.body xml.elements.each("api/query/categorymembers/cm") { |elt| pages.push elt.attributes['title'].to_s } return pages end def getImagesFromMeetup(meetup) wiki = XkcdUrl.new wiki.queryParams = {} wiki.queryParams["action"] = "query" wiki.queryParams["titles"] = meetup.to_s.tr(' ', '_') wiki.queryParams["prop"] = "images" wiki.queryParams["format"] = "xml" url = wiki.buildUrl r = Net::HTTP.get_response(url) if (r.code != "200") return [] end images = [] xml = REXML::Document.new r.body xml.elements.each("api/query/pages/page/images/im") { |elt| images.push elt.attributes['title'].to_s } return images end def getCategoriesFromImage(image) wiki = XkcdUrl.new wiki.queryParams = {} wiki.queryParams["action"] = "query" wiki.queryParams["titles"] = image.to_s.tr(' ', '_') wiki.queryParams["prop"] = "categories" wiki.queryParams["format"] = "xml" url = wiki.buildUrl r = Net::HTTP.get_response(url) if (r.code != "200") return [] end cats = [] xml = REXML::Document.new r.body xml.elements.each("api/query/pages/page/categories/cl") { |elt| cats.push elt.attributes['title'].to_s } return cats end def main cats 
= getMeetupCategories sleep 1 cats.each do |cat| puts "Category:#{cat}" pages = getPagesFromCategory(cat) sleep 1 pages.each do |page| puts " #{page}" images = getImagesFromMeetup(page) sleep 1 images.each do |image| puts " #{image}" imageCats = getCategoriesFromImage(image) sleep 1 if imageCats.member? "Category:#{cat}" puts "Found an image that _is_ categorized: #{image} in #{cat}" end end end end end main