Memory leak in a scraper [closed]

Posted on

Problem

This is a scraper written in CoffeeScript for NodeJS. It is run in an interval (set to 5 seconds here to dramtically show the leak). Somewhere this code leaks memory. I already tried nulling a few things for manually deleting references. Where is the reference kept that produces the leak?

request = require 'request'
jsdom   = require 'jsdom'
util    = require 'util'

scrapeURL   = 'http://kwlpls.adiwidjaja.com/index.php'
jqueryUrl   = 'http://code.jquery.com/jquery-1.6.1.min.js'
scrapeDivId = "cc-m-externalsource-container-m8a3ae44c30fa9708"

scrapeDelay = 5 * 1000 # 5 seconds
data = []

fetch = (callback) ->
    request { uri: scrapeURL }, (error, response, body) ->
        if error and response and response.statusCode isnt 200
            util.log 'Error when contacting #{scrapeURL}'

        # Fake Browser
        jsdom.env
            html: body,
            scripts: [jqueryUrl],
            (err, window) ->
                processPage(window, (result) ->
                    callback(err, result)
                )

processPage = (window, callback) ->
    # cleanup
    result           = {}
    result.cities    = []
    result.parkings  = []

    $    = window.jQuery
    rows = $('table').children()
    num  = $(rows).size()

    rows.each (i, row) ->
        if i > 0 and i isnt num - 1 # cut off header and footer
            processRow($, row, (item, city) ->
                result.parkings.push(item) if item?
                result.cities.push(city) if city?
                item = null
                city = null
                callback(result) if i is num - 2
            )

processRow = ($, row, callback) ->
    elements  = $(row).children('td')
    item      = {}
    city      = null
    nameStr   = elements?.eq(0).html()
    nameStr  ?= ""
    item.kind = nameStr.substring 0, 2
    item.name = nameStr.substring 3

    if elements.size() > 2
        free           = elements?.eq(2).html()
        spaces         = elements?.eq(1).html()
        item.free      = free
        item.spaces    = spaces
        item.status    = "open"
    else if elements.size() > 0
        item.status    = "closed"
    else
        header      = $(row).children().first().html()
        currentCity = header.split(' ')[1]
        city        = {}
        city.name   = currentCity

    item.city = currentCity
    if item.name is "" or not item.name? then item = null

    # cleanup
    elements = null
    $ = null

    callback(item, city)

cacheJson = () ->
    fetch(
        (err, result) ->
            util.log err if err?
            data = result ? []
            util.log 'Fetched ' + data?.parkings?.length + ' entries'
    )

scrapeIntervalId = setInterval cacheJson, scrapeDelay

Solution

I got here reading about garbage collection and memory leakage in CS. It’s an old post but I ‘ll give my 2 cents anyway. When emptying an array it’s often recommended to avoid using array = [], but use array.length = 0.

Also, in functions that you don’t indent to return, perhaps it’s better to implicitly return undefined so that to avoid returning functions just because they appear at the very end of your CS function call.

Leave a Reply

Your email address will not be published. Required fields are marked *