Forums | Admin

Discussion Forums: help

Start New Thread Start New Thread

 

By: venkat venkat
help needed regarding memory leaks [ reply ]  
2008-03-12 09:43
Hi all, I am a newbie dealing with memory leaks. Recently I have written a web scraping program which tends to eat up large chunks of my memory. Its possibly due to a memory leak but couldnt figure out where the possible leak is...

here is my code:


require 'rubygems'

require 'mechanize'

require 'fileutils'

require 'timeout'



class Scrape



attr_accessor :agent



def initialize

@agent = WWW::Mechanize.new

end



def fetch_links(start, finish, count)

page = WWW::Mechanize::Page
while start < finish

begin

File.open('log.txt', 'a') do |log|

log.puts "Trying to fetch #{count} links from #{start} to #{start+count}"

end

page_links = []

Timeout::timeout(10) do

page = agent.get("http://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&datea=&dateb=&company=&type=&SIC=&State=&CIK=&owner=only&accno=&start=#{start}&count=#{count}";)

end

page.links.each {|link|

page_links << "http://www.sec.gov"+link.href if link.href.match(/^\/Archives\/((\w+)\/)+((\w|-\w)+)\.(html|htm)$/)

}

page = nil

page_links.each {|html_link|

begin

download_file(html_link) # if !@@html_links.include?(html_link)

rescue Timeout::Error

File.open('log.txt', 'a') do |log|

log.puts "Request Timeout while following "+html_link

end

next

rescue SocketError, Errno::EINTR , Errno::EINVAL, Errno::ECONNRESET, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError

File.open('log.txt', 'a') do |log|

log.puts "Network/Unknown Error while following "+html_link

end

next

end

}
page_links = nil

start += count

rescue SocketError, Errno::EINTR , Errno::EINVAL, Errno::ECONNRESET, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError

File.open('log.txt', 'a') do |log|

log.puts "Bad Network!! Trying to fetch next #{count} links from #{start} to #{start+count}"

end

start += count

start = 0 if start == finish
sleep(2)

rescue Timeout::Error

File.open('log.txt', 'a') do |log|

log.puts "Request Timeout!! Trying to fetch next #{count} links from #{start} to #{start+count}"

end

start += count

start = 0 if start == finish
sleep(2)

end

end

end



def download_file(html_link)

Timeout::timeout(10) do

page = agent.get(html_link)

doc = Hpricot(page.content)

file_name = doc.search("//b[@class='blue']")[5].inner_html

doc = nil

file_name = file_name.split(' ').join('_')

file_name = file_name.split(':').join('-')

file_name.concat('.xml')

page.links.each do |xml_link|

if xml_link.text.match(/\.xml$/)

tmp_arr = xml_link.href.split('/Archives/edgar/data/')[1].split('/')

tmp_dir_path = (tmp_arr - tmp_arr.last.to_a).join('/')

dir_path = "Archives/#{tmp_dir_path}"

file_path = dir_path+"/"+file_name

FileUtils.mkdir_p(dir_path)

if File.exists?(file_path)

# file already exists

else

# puts "downloading file"

File.open(file_path, "w") {|f| f.write(xml_link.click.content)}

end

#@@html_links << html_link

end

end

file_name = nil
page = nil

end


end

end





flag = true



while flag

scrape = Scrape.new
start_time = Time.now

puts "starting..."
File.open('log.txt', 'a') do |log|

log.puts "started at " + start_time.strftime("%Y-%m-%d %H-%M-%S")

end

scrape.fetch_links(0, 1000, 10)

scrape = nil
end_time = Time.now

File.open('log.txt', 'a') do |log|

log.puts "ended at " + end_time.strftime("%Y-%m-%d %H-%M-%S")

for i in 1..3

log.puts "\n"

end

end



puts "completed..sleeping for a while..."
sleep(10)

File.open('log.txt', 'a') do |log|

log.puts "starting again....."

end

puts "starting again..."

end


anybody help me figure out the mem leak...any help greatly appreciated..

regards,
venkat