Problem
This is a scraper written in CoffeeScript for NodeJS. It is run in an interval (set to 5 seconds here to dramatically show the leak). Somewhere this code leaks memory. I already tried nulling a few things to manually delete references. Where is the reference kept that produces the leak?
request = require 'request'
jsdom = require 'jsdom'
util = require 'util'
# Target page and the jQuery build injected into the fake browser.
scrapeURL = 'http://kwlpls.adiwidjaja.com/index.php'
jqueryUrl = 'http://code.jquery.com/jquery-1.6.1.min.js'
# NOTE(review): scrapeDivId is never referenced below — presumably the
# container the table lives in; confirm before deleting.
scrapeDivId = "cc-m-externalsource-container-m8a3ae44c30fa9708"
scrapeDelay = 5 * 1000 # 5 seconds
# Latest parsed result; replaced wholesale by each successful scrape.
data = []
# Fetch the scrape target, load the body into a fake browser (jsdom with
# jQuery injected) and hand the parsed result to `callback(err, result)`.
fetch = (callback) ->
  request { uri: scrapeURL }, (error, response, body) ->
    # Log transport errors OR a non-200 status (original `and` chain only
    # fired when both happened at once, which never does).
    # Double quotes are required for CoffeeScript interpolation.
    if error or response?.statusCode isnt 200
      util.log "Error when contacting #{scrapeURL}"
    # Fake Browser
    jsdom.env
      html: body,
      scripts: [jqueryUrl],
      (err, window) ->
        processPage window, (result) ->
          # THE LEAK: jsdom windows are not garbage collected until they
          # are closed explicitly; without this, every interval run
          # retains a whole DOM tree plus a jQuery instance.
          window.close()
          callback(err, result)
# Walk the scraped table's rows and build a result object holding
# `cities` and `parkings` arrays; invokes `callback(result)` once after
# the last data row has been processed.
processPage = (window, callback) ->
  result =
    cities: []
    parkings: []
  $ = window.jQuery
  rows = $('table').children()
  num = rows.size()
  # A table with only header + footer has no data rows; report the empty
  # result instead of never calling back (original silently hung here).
  return callback(result) if num <= 2
  rows.each (i, row) ->
    if i > 0 and i isnt num - 1 # cut off header and footer
      processRow $, row, (item, city) ->
        result.parkings.push(item) if item?
        result.cities.push(city) if city?
        # Rebinding item/city to null here (as the original did) has no
        # GC effect — it only rebinds the callback's local parameters.
        # processRow is synchronous, so rows arrive in order and the
        # last data row is i == num - 2.
        callback(result) if i is num - 2
# Parse one table row into a parking `item` and — for city header rows —
# a `city` object. Calls `callback(item, city)`; either may be null.
processRow = ($, row, callback) ->
  elements = $(row).children('td')
  item = {}
  city = null
  nameStr = elements.eq(0).html() ? ""
  item.kind = nameStr.substring 0, 2
  item.name = nameStr.substring 3
  if elements.size() > 2
    # Data row layout: name | total spaces | free spaces
    item.free = elements.eq(2).html()
    item.spaces = elements.eq(1).html()
    item.status = "open"
  else if elements.size() > 0
    item.status = "closed"
  else
    # No <td> children: this is a header row announcing the current city,
    # e.g. "Stadt Foo" — the city name is the second word.
    header = $(row).children().first().html()
    currentCity = header?.split(' ')[1]  # guard: .html() may return null
    city = name: currentCity
    item.city = currentCity
  # Rows without a usable name carry no parking data.
  item = null if item.name is "" or not item.name?
  # Note: the original nulled `elements` and `$` here "for GC" — that only
  # rebinds locals and frees nothing, so it is dropped.
  callback(item, city)
# Runs one scrape cycle and caches the outcome in the module-level `data`.
cacheJson = ->
  onFetched = (err, result) ->
    util.log err if err?
    data = result ? []
    util.log 'Fetched ' + data?.parkings?.length + ' entries'
  fetch onFetched

# Poll the page on a fixed interval; keep the id so the timer can be
# cleared with clearInterval later.
scrapeIntervalId = setInterval cacheJson, scrapeDelay
Solution
I got here reading about garbage collection and memory leakage in CoffeeScript. It's an old post but I'll give my 2 cents anyway. When emptying an array it's often recommended to avoid using `array = []` and to use `array.length = 0` instead.
Also, in functions that you don't intend to return a value from, it's better to explicitly return undefined, so as to avoid accidentally returning whatever expression happens to appear at the very end of your CoffeeScript function body.