# blob: 4adf0ca163a3f386a7f31a2e7e0bdcaa3945b63a [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
require 'rubygems'
require 'mechanize'
require 'anemone'
require 'pandoc-ruby'
# require 'json/ext'
# CSS selectors for page furniture that is deleted from each article body
# before the HTML is converted to Markdown. The list is joined into a single
# selector group at the point of use. Frozen so that the shared constant
# cannot be mutated by accident.
TO_REMOVE = [
  'div.toc-filter-back-to-top',
  '.rate-yesno-title',
  'colgroup',
  'div.rate-widget',
  'div.toc-filter.toc-filter-bullet'
].freeze
# Crawl the documentation site and collect the URL of every article page.
puts "Crawling..."
# Both the link filter and the page callback use the same URL pattern.
article_pattern = /https?\:\/\/apigee.com\/docs\/(app-services|geode)\/content/
urls = []
Anemone.crawl("http://apigee.com/docs/app_services", :skip_query_strings => true) do |crawler|
  # Debugging aid: crawler.on_every_page {|page| puts page.url}
  # Alternative filter: crawler.skip_links_like(/https?\:\/\/apigee.com\/docs\/(comment|node|api-platform|console|ja|enterprise|consoletogo)/)

  # Only follow links that point at article pages.
  crawler.focus_crawl do |page|
    page.links.select { |link| link.to_s =~ article_pattern }
  end

  # Record each visited page whose URL matches the article pattern.
  crawler.on_pages_like(article_pattern) do |page|
    urls << page.url
  end
end

# Normalise to a sorted list of unique URL strings.
urls = urls.compact.map(&:to_s).uniq.sort
puts "Found #{urls.size} documentation articles"
puts urls.join("\n")
# Wait for a line of input before the conversion phase starts.
gets
# Fetch every crawled article, strip unwanted markup from its body, convert
# the remaining HTML to Markdown and write it, prefixed with front matter,
# to ../content/docs/<date>-<name>.md.
a = Mechanize.new { |agent|
  agent.user_agent_alias = 'Mac Safari'
}
# Computed once so every file written by this run carries the same date prefix.
today = Time.new.strftime('%Y-%m-%d')
urls.each do |url|
  name = url.split('/')[-1]
  puts "Processing #{name}"
  begin
    a.get(url) do |article|
      body = article.search('section#block-system-main>div.node>div.field-name-body').first
      # Pages without the expected body container are skipped.
      next if body.nil?

      # The front matter needs a title. Use the first <h1> of the page and
      # fall back to the last URL segment when the page has no heading.
      # Colons are replaced because the title is written as a "key: value" line.
      title = article.search('h1').first
      title_text = (title ? title.inner_html : name).gsub(':', ' - ')

      # body.children.first.add_previous_sibling(title)
      # body.search('br').each {|l| l.remove}
      body.search(TO_REMOVE.join(', ')).each { |node| node.remove }

      # Replace each div#collapse with an empty anchor carrying the same id.
      body.search('div#collapse').each do |div|
        div.add_next_sibling %(<a id="#{div['id']}"></a>)
        div.remove
      end

      body.search('h2').each { |heading| heading.remove_attribute('class') }
      body.search('*').each { |node| node.remove_attribute('style') }

      # Strip the "/docs/app-services/content/" prefix from link targets.
      # Anchors without an href are left untouched.
      body.search('a').each do |link|
        href = link['href']
        next if href.nil?
        link['href'] = href.sub(%r{\A/docs/app-services/content/}, '/')
      end

      # Hand pandoc an explicit HTML string rather than the node object.
      markdown = PandocRuby.convert(body.to_html, :from => :html, :to => :markdown)
      # The longer product name must be replaced first; the shorter one is a
      # substring of it.
      markdown = markdown.gsub('Apigee App Services', 'Apache Usergrid')
      markdown = markdown.gsub('App Services', 'Apache Usergrid')

      front_matter = "---\ntitle: #{title_text}\ncategory: \nlayout: article\n---\n\n"
      File.open("../content/docs/#{today}-#{name}.md", 'w') { |f| f.write(front_matter + markdown) }
    end
  rescue StandardError => e
    # Report the failure and continue with the next article. StandardError
    # (rather than Exception) keeps Ctrl-C and exit working.
    puts e
  end
end