Hi all, I am a newbie at dealing with memory leaks. I recently wrote a web-scraping program that tends to eat up large chunks of memory. It's possibly due to a memory leak, but I couldn't figure out where the leak might be...
Here is my code:
require 'rubygems'
require 'mechanize'
require 'fileutils'
require 'timeout'
# Walks the filing listing served by www.sec.gov's browse-edgar endpoint
# (action=getcurrent), follows every filing index page it links to, and saves
# the XML document(s) linked from each index page under ./Archives/.
#
# Progress and failures are appended to log.txt in the working directory.
class Scrape
  # File that receives progress and error messages.
  LOG_FILE = 'log.txt'

  # Seconds allowed for one listing request or one filing download.
  REQUEST_TIMEOUT = 10

  # Seconds to pause after a listing request fails.
  RETRY_DELAY = 2

  # Prefix turning a site-relative href into an absolute URL.
  SITE_ROOT = 'http://www.sec.gov'

  # Site-relative hrefs that are followed from a listing page.
  ARCHIVE_LINK = %r{^/Archives/((\w+)/)+((\w|-\w)+)\.(html|htm)$}

  # Everything after this marker in an XML link's href becomes the local path.
  EDGAR_DATA_PREFIX = '/Archives/edgar/data/'

  # The file name is taken from the sixth <b class="blue"> element of a filing page.
  TITLE_XPATH = "//b[@class='blue']"
  TITLE_INDEX = 5

  # Failures treated as "network trouble": they are logged and skipped.
  NETWORK_ERRORS = [
    SocketError, Errno::EINTR, Errno::EINVAL, Errno::ECONNRESET, EOFError,
    Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
  ].freeze

  attr_accessor :agent

  def initialize
    @agent = WWW::Mechanize.new
    # Mechanize records every page it fetches in the agent's history, and the
    # history is unbounded unless capped. A long crawl would therefore keep
    # every downloaded page reachable and memory would grow without limit.
    @agent.max_history = 1
  end

  # Processes listing pages from offset +start+ up to (excluding) +finish+,
  # +count+ entries per page, downloading every filing each page links to.
  #
  # A timeout or network error on a listing page is logged and that page is
  # skipped after a short pause. Always returns nil.
  def fetch_links(start, finish, count)
    while start < finish
      begin
        log "Trying to fetch #{count} links from #{start} to #{start + count}"
        listing_links(start, count).each { |html_link| follow(html_link) }
        start += count
      rescue *NETWORK_ERRORS
        log "Bad Network!! Trying to fetch next #{count} links from #{start} to #{start + count}"
        start = advance_after_failure(start, finish, count)
      rescue Timeout::Error
        log "Request Timeout!! Trying to fetch next #{count} links from #{start} to #{start + count}"
        start = advance_after_failure(start, finish, count)
      end
    end
  end

  # Fetches the filing index page at +html_link+ and saves each XML document
  # it links to as Archives/<path from the XML href>/<filing title>.xml.
  # Files that already exist are left untouched.
  #
  # The whole operation must finish within REQUEST_TIMEOUT seconds, otherwise
  # Timeout::Error is raised. Network errors propagate to the caller.
  # Returns nil.
  def download_file(html_link)
    Timeout.timeout(REQUEST_TIMEOUT) { save_filing_documents(html_link) }
    nil
  end

  private

  # Appends one line to the log file.
  def log(message)
    File.open(LOG_FILE, 'a') { |file| file.puts message }
  end

  # Returns the absolute URLs of all filing index pages linked from the
  # listing page at the given offset.
  def listing_links(start, count)
    page = Timeout.timeout(REQUEST_TIMEOUT) do
      agent.get("http://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&datea=&dateb=&company=&type=&SIC=&State=&CIK=&owner=only&accno=&start=#{start}&count=#{count}")
    end
    hrefs = page.links.map { |link| link.href }
    # Links without an href are skipped instead of raising on nil.
    hrefs.select { |href| href && href =~ ARCHIVE_LINK }.map { |href| SITE_ROOT + href }
  end

  # Downloads one filing; a timeout or network error is logged and swallowed
  # so the remaining filings of the listing page are still processed.
  def follow(html_link)
    download_file(html_link)
  rescue Timeout::Error
    log "Request Timeout while following " + html_link
  rescue *NETWORK_ERRORS
    log "Network/Unknown Error while following " + html_link
  end

  # Moves past a listing page that could not be fetched and pauses briefly.
  # NOTE(review): landing exactly on +finish+ restarts the scan from 0, so a
  # persistently failing last page keeps the pass running; kept as in the
  # original behaviour - confirm this is intended.
  def advance_after_failure(start, finish, count)
    start += count
    start = 0 if start == finish
    sleep(RETRY_DELAY)
    start
  end

  # Body of #download_file, run inside its timeout.
  def save_filing_documents(html_link)
    page = agent.get(html_link)
    file_name = filing_file_name(page)
    unless file_name
      log "No filing title found on " + html_link
      return
    end

    page.links.each do |xml_link|
      next unless xml_link.text =~ /\.xml$/

      dir_path = archive_dir_for(xml_link.href)
      next unless dir_path

      file_path = "#{dir_path}/#{file_name}"
      FileUtils.mkdir_p(dir_path)
      next if File.exist?(file_path)

      # Download before opening the target: opening first would leave an empty
      # file behind when the download fails, and the existence check above
      # would then skip this document forever.
      content = xml_link.click.content
      File.open(file_path, 'w') { |file| file.write(content) }
    end
  end

  # Builds the local file name from the page's title element: whitespace runs
  # become "_", ":" becomes "-", and ".xml" is appended.
  # Returns nil when the page has no such element.
  def filing_file_name(page)
    title_node = Hpricot(page.content).search(TITLE_XPATH)[TITLE_INDEX]
    return nil unless title_node

    title = title_node.inner_html
    title = title.split(' ').join('_')
    title = title.split(':').join('-')
    title + '.xml'
  end

  # Maps an XML link's href to the local directory it is stored in: the part
  # after EDGAR_DATA_PREFIX without its final segment, below "Archives/".
  # Returns nil when the href is missing or does not contain the prefix.
  def archive_dir_for(href)
    relative = href.to_s.split(EDGAR_DATA_PREFIX)[1]
    return nil unless relative

    segments = relative.split('/')
    "Archives/#{segments[0...-1].join('/')}"
  end
end
# Driver: runs one full scrape pass after another, forever. Each pass uses a
# fresh Scrape instance, covers listing offsets 0...1000 in pages of 10, and
# records its start and end times in log.txt before pausing for 10 seconds.
loop do
  scraper = Scrape.new
  started_at = Time.now
  puts "starting..."
  File.open('log.txt', 'a') do |log|
    log.puts "started at #{started_at.strftime('%Y-%m-%d %H-%M-%S')}"
  end

  scraper.fetch_links(0, 1000, 10)
  scraper = nil # drop the reference to the finished scraper

  finished_at = Time.now
  File.open('log.txt', 'a') do |log|
    log.puts "ended at #{finished_at.strftime('%Y-%m-%d %H-%M-%S')}"
    # Three blank lines separate consecutive passes in the log.
    3.times { log.puts "\n" }
  end

  puts "completed..sleeping for a while..."
  sleep(10)

  File.open('log.txt', 'a') { |log| log.puts "starting again....." }
  puts "starting again..."
end
Can anybody help me figure out the memory leak? Any help is greatly appreciated.
regards,
venkat
|