Memory leaks and performance regressions in production Node.js applications can be subtle — heap memory growing 1MB per hour eventually causes an OOM crash after days, and a 10ms latency increase per deployment compounds into seconds over time. Diagnosing these issues requires production-safe profiling tools, automated memory monitoring, and a systematic approach to finding the specific code paths and data structures responsible. This lesson builds the complete production observability and diagnosis toolkit for the MEAN Stack API.
Memory Leak Common Sources
| Leak Source | Pattern | Detection |
|---|---|---|
| Global variable accumulation | Data pushed to module-level arrays/maps and never removed | Heap snapshot: object count growth in same class |
| Event listener accumulation | Listeners added on every request to long-lived emitters | EventEmitter.listenerCount() growing |
| Closure retention | Closures capturing large objects that outlive their use | Heap snapshot: retained size dominators |
| Cache without eviction | Map/object used as cache with no TTL or max size | Heap snapshot: Map/Object size growing |
| Promise leak | Promises created but never resolved or rejected | Pending promise count growing |
| Timer accumulation | setInterval without clearInterval, especially in hot paths | Active handles count: process._getActiveHandles() |
Use clinic.js in staging to identify memory leaks before they reach production. `clinic heapprofile -- node server.js` records a heap allocation profile over time, showing which call stacks are responsible for allocations that end up in old space. Unlike heap snapshots (point-in-time), allocation profiles show the cause of growth rather than just the result. Run it while replaying a production traffic pattern for the most accurate results. For continuous monitoring in production itself, rely on process.memoryUsage() polling or the default Node.js metrics from Prometheus's prom-client — these have negligible overhead.
Complete Production Diagnostics
// ── src/diagnostics/memory-monitor.js ────────────────────────────────────
const v8 = require('v8');
const path = require('path');
const { logger } = require('../config/logger');
// Memory trend tracking — keep a sliding window of recent heap samples so
// sustained (monotonic) growth can be distinguished from normal GC churn.
const SAMPLE_SIZE = 10; // number of samples retained in the sliding window
const heapSamples = []; // FIFO of { timestamp, heapUsedMB, heapPct }, max SAMPLE_SIZE
let gcCount = 0; // total GC runs observed; incremented by the GC PerformanceObserver below
/**
 * Take one heap sample: record it in the sliding window, flag sustained
 * growth (potential leak), trigger a heap snapshot near the V8 heap limit,
 * and emit a debug-level metrics line. Invoked on an interval by
 * startMemoryMonitoring().
 */
function recordHeapSample() {
  // Only heapUsed and rss are reported; heapTotal/external were previously
  // destructured but never used.
  const { heapUsed, rss } = process.memoryUsage();
  const heapStats = v8.getHeapStatistics();
  heapSamples.push({
    timestamp: Date.now(),
    heapUsedMB: heapUsed / 1e6,
    heapPct: heapUsed / heapStats.heap_size_limit,
  });
  if (heapSamples.length > SAMPLE_SIZE) heapSamples.shift();
  // Detect monotonic growth — potential leak. A window of N samples spans
  // N-1 intervals, so divide by SAMPLE_SIZE - 1; dividing by SAMPLE_SIZE
  // under-reported the per-sample growth rate by ~10%.
  if (heapSamples.length === SAMPLE_SIZE) {
    const oldest = heapSamples[0];
    const newest = heapSamples[SAMPLE_SIZE - 1];
    const growthPerSample = (newest.heapUsedMB - oldest.heapUsedMB) / (SAMPLE_SIZE - 1);
    if (growthPerSample > 5) { // growing more than 5MB per sample interval
      logger.warn('Potential memory leak detected', {
        growthPerSampleMB: growthPerSample.toFixed(2),
        currentMB: newest.heapUsedMB.toFixed(1),
      });
    }
  }
  // Critical threshold — proactively capture a snapshot for offline analysis
  // before the process gets close to an OOM crash.
  const pct = heapUsed / heapStats.heap_size_limit;
  if (pct > 0.85) {
    takeHeapSnapshot();
  }
  // Debug level so routine samples don't flood production logs.
  logger.debug('Heap status', {
    heapUsedMB: (heapUsed / 1e6).toFixed(1),
    heapPct: (pct * 100).toFixed(1) + '%',
    rssMB: (rss / 1e6).toFixed(1),
    gcCount,
  });
}
// GC monitoring — count every collection and surface pauses long enough to
// show up as request-latency spikes in dashboards.
const { PerformanceObserver } = require('perf_hooks');
const gcObserver = new PerformanceObserver((list) => {
  list.getEntries().forEach((entry) => {
    gcCount += 1;
    // A pause above 100ms is worth correlating with latency anomalies.
    if (entry.duration > 100) {
      logger.warn('Long GC pause', {
        kind: entry.detail?.kind,
        duration: `${entry.duration.toFixed(2)}ms`,
      });
    }
  });
});
gcObserver.observe({ entryTypes: ['gc'] });
let lastSnapshotTime = 0;
// Write a V8 heap snapshot to /tmp for offline analysis, throttled to at
// most one per 5 minutes — snapshots are expensive to produce, so repeated
// triggers must not pile up.
function takeHeapSnapshot() {
  const now = Date.now();
  const COOLDOWN_MS = 5 * 60 * 1000;
  if (now - lastSnapshotTime < COOLDOWN_MS) return;
  lastSnapshotTime = now;
  const filename = path.join('/tmp', `heap-${now}.heapsnapshot`);
  try {
    const written = v8.writeHeapSnapshot(filename);
    logger.warn('Heap snapshot written', { path: written });
  } catch (err) {
    // Best-effort: a failed snapshot is logged, never fatal.
    logger.error('Failed to write heap snapshot', { error: err.message });
  }
}
// Kick off heap sampling every 30 seconds. The timer is unref()'d so a
// pending tick never keeps the process alive during graceful shutdown.
function startMemoryMonitoring() {
  const timer = setInterval(recordHeapSample, 30_000);
  timer.unref();
}
module.exports = { startMemoryMonitoring };
// ── src/diagnostics/event-loop-monitor.js ────────────────────────────────
// Detect event loop blocking (>50ms is a problem)
function monitorEventLoop(threshold = 50) {
  const { monitorEventLoopDelay } = require('perf_hooks');
  const delayHistogram = monitorEventLoopDelay({ resolution: 10 });
  delayHistogram.enable();
  const timer = setInterval(() => {
    // Histogram values are in nanoseconds; convert to milliseconds.
    const p99Millis = delayHistogram.percentile(99) / 1e6;
    if (p99Millis > threshold) {
      logger.warn('Event loop delay detected', {
        p99Ms: p99Millis.toFixed(2),
        maxMs: (delayHistogram.max / 1e6).toFixed(2),
      });
    }
    delayHistogram.reset(); // start a fresh one-minute window
  }, 60_000);
  timer.unref();
}
// ── src/diagnostics/active-handles.js ────────────────────────────────────
// Diagnose what is keeping Node.js alive (useful in shutdown debugging)
function logActiveHandles() {
  // NOTE: _getActiveHandles/_getActiveRequests are undocumented internals —
  // acceptable for ad-hoc diagnostics, not for program logic.
  const handles = process._getActiveHandles();
  const requests = process._getActiveRequests();
  const handleTypeSet = new Set(handles.map((h) => h.constructor.name));
  logger.debug('Active handles', {
    handleCount: handles.length,
    handleTypes: [...handleTypeSet],
    requestCount: requests.length,
  });
}
// ── Diagnostic HTTP endpoint (admin-only) ─────────────────────────────────
// src/routes/diagnostics.routes.js
const router = require('express').Router();
const { requireRole } = require('../middleware/rbac.middleware');
// Admin-only diagnostic endpoints — this guard applies to every route
// registered on the router below; these endpoints expose internal state.
router.use(requireRole('admin'));
// Current memory picture: V8 heap usage against the heap limit, RSS, and
// external (off-heap) allocations, reported in megabytes.
router.get('/diagnostics/memory', (req, res) => {
  const mem = process.memoryUsage();
  const stats = v8.getHeapStatistics();
  const toMB = (bytes) => (bytes / 1e6).toFixed(2);
  res.json({
    heapUsedMB: toMB(mem.heapUsed),
    heapTotalMB: toMB(mem.heapTotal),
    heapLimitMB: toMB(stats.heap_size_limit),
    heapUsedPct: ((mem.heapUsed / stats.heap_size_limit) * 100).toFixed(1) + '%',
    rssMB: toMB(mem.rss),
    externalMB: toMB(mem.external),
    uptimeSeconds: process.uptime(),
  });
});
// Force a garbage collection and report how much heap it reclaimed.
// Only works when the process was started with the --expose-gc flag.
router.post('/diagnostics/gc', (req, res) => {
  if (!global.gc) {
    return res.status(400).json({ message: 'GC not exposed. Start with --expose-gc' });
  }
  const heapBefore = process.memoryUsage().heapUsed;
  global.gc();
  const heapAfter = process.memoryUsage().heapUsed;
  res.json({
    freedMB: ((heapBefore - heapAfter) / 1e6).toFixed(2),
    heapUsedAfterMB: (heapAfter / 1e6).toFixed(2),
  });
});
// Write a heap snapshot on demand for offline analysis in Chrome DevTools.
// v8.writeHeapSnapshot is synchronous, so a failure throws here directly;
// without the try/catch the original async handler turned that throw into
// an unhandled promise rejection (Express 4 does not catch async errors)
// instead of an error response.
router.post('/diagnostics/heap-snapshot', (req, res) => {
  const filename = path.join('/tmp', `heap-manual-${Date.now()}.heapsnapshot`);
  try {
    const written = v8.writeHeapSnapshot(filename);
    res.json({ path: written, message: 'Heap snapshot written to disk' });
  } catch (err) {
    res.status(500).json({ message: 'Failed to write heap snapshot', error: err.message });
  }
});
module.exports = router;
How It Works
Step 1 โ Monotonic Heap Growth Is the Leak Signal
A healthy heap fluctuates — it grows as objects are allocated and shrinks after GC. A leaking heap grows consistently over time even after GC cycles. The monotonic growth detector keeps a sliding window of heap samples and flags when the trend is consistently upward. 5MB growth per 30-second sample (roughly 600MB/hour) would trigger an alert and prompt investigation before it causes an OOM crash.
Step 2 โ GC Monitoring Correlates Pauses with Latency Spikes
The PerformanceObserver for GC events captures every garbage collection with its duration. A major GC taking 200ms explains a 200ms latency spike in your metrics for that moment. Correlating GC pauses with request latency spikes in your monitoring dashboard helps distinguish “slow query” from “GC pause” as the root cause of latency anomalies.
Step 3 โ Event Loop Delay Monitoring Detects Blocking Code
monitorEventLoopDelay() returns a histogram that measures how long the event loop's internal sampling timer is delayed past its scheduled time. A 50ms delay means the event loop was busy for 50ms processing a previous task — during which time all other requests were queued. A p99 of 100ms means 1% of sampling-timer callbacks were delayed by 100ms — correlating with p99 HTTP latency of 100ms plus normal processing time.
Step 4 โ .unref() Prevents Monitor from Blocking Shutdown
Calling .unref() on a timer (setInterval/setTimeout) means the timer does not prevent the Node.js process from exiting if it is the only active handle. Without .unref(), the 30-second monitoring interval would keep the process alive after all HTTP connections are closed during graceful shutdown — causing up to a 30-second delay. With .unref(), monitoring timers become "background" tasks that yield to shutdown.
Step 5 โ Admin Diagnostic Endpoints Enable On-Demand Diagnosis
Admin-only HTTP endpoints for memory status, manual GC, and heap snapshot creation enable on-demand diagnosis without SSHing into the server. The memory endpoint provides the current heap statistics in a readable format. The heap snapshot endpoint triggers a snapshot that can be downloaded and opened in Chrome DevTools. These endpoints must be protected by the admin role guard — exposing them publicly reveals internal application state.
Quick Reference
| Task | Code |
|---|---|
| Heap usage | process.memoryUsage().heapUsed / 1e6 + 'MB' |
| Heap limit | v8.getHeapStatistics().heap_size_limit |
| Write heap snapshot | v8.writeHeapSnapshot('/tmp/heap.heapsnapshot') |
| Monitor GC | new PerformanceObserver(cb).observe({ entryTypes: ['gc'] }) |
| Event loop delay | perf_hooks.monitorEventLoopDelay({ resolution: 10 }) |
| Active handles | process._getActiveHandles() |
| Force GC | global.gc() (requires --expose-gc flag) |
| Don’t block shutdown | setInterval(fn, ms).unref() |