[{"data":1,"prerenderedAt":980},["ShallowReactive",2],{"\u002Fblog\u002Fwhat-to-monitor-checklist":3},{"id":4,"title":5,"author":6,"body":8,"category":968,"date":969,"description":970,"extension":971,"image":972,"lastUpdated":972,"meta":973,"navigation":974,"path":975,"readingTime":976,"seo":977,"stem":978,"__hash__":979},"blog\u002Fblog\u002Fwhat-to-monitor-checklist.md","What to Monitor: The Complete Checklist for SaaS, E-commerce, and APIs",{"name":7},"Vantaj Team",{"type":9,"value":10,"toc":944},"minimark",[11,15,18,25,28,33,36,41,135,146,150,204,208,272,275,279,329,331,335,338,403,406,408,412,415,469,471,475,478,564,567,569,573,576,662,665,667,671,674,749,751,755,758,792,795,799,891,894,898,902,905,909,912,916,919,923,926],[12,13,14],"p",{},"The most common question from teams setting up monitoring for the first time is: what should I actually be watching?",[12,16,17],{},"Most guides list monitor types. This one tells you which specific endpoints, certificates, jobs, and records to monitor, organized by priority, so you can set up a complete monitoring stack without missing the things that matter.",[12,19,20,24],{},[21,22,23],"strong",{},"Priority key:"," 🔴 Critical: alert immediately. 🟡 Important: alert within 5 minutes. 
🟢 Informational: daily digest is sufficient.",[26,27],"hr",{},[29,30,32],"h2",{"id":31},"http-and-application-monitors","HTTP and Application Monitors",[12,34,35],{},"These confirm your application is responding correctly, not just that the server is running.",[37,38,40],"h3",{"id":39},"for-every-product","For Every Product",[42,43,44,61],"table",{},[45,46,47],"thead",{},[48,49,50,54,58],"tr",{},[51,52,53],"th",{},"Monitor",[51,55,57],{"align":56},"center","Priority",[51,59,60],{},"Why",[62,63,64,76,87,97,115,125],"tbody",{},[48,65,66,70,73],{},[67,68,69],"td",{},"Homepage \u002F root URL",[67,71,72],{"align":56},"🟡",[67,74,75],{},"First thing customers check when something feels wrong",[48,77,78,81,84],{},[67,79,80],{},"Login \u002F auth endpoint",[67,82,83],{"align":56},"🔴",[67,85,86],{},"If users can't log in, the rest of the product is irrelevant",[48,88,89,92,94],{},[67,90,91],{},"Primary API endpoint",[67,93,83],{"align":56},[67,95,96],{},"The most-called endpoint your product depends on",[48,98,99,102,104],{},[67,100,101],{},"Health check endpoint",[67,103,83],{"align":56},[67,105,106,110,111,114],{},[107,108,109],"code",{},"\u002Fhealth"," or ",[107,112,113],{},"\u002Fping","; your own team uses this to verify recovery",[48,116,117,120,122],{},[67,118,119],{},"Signup \u002F registration",[67,121,72],{"align":56},[67,123,124],{},"A broken signup flow means zero new users until someone notices",[48,126,127,130,132],{},[67,128,129],{},"Password reset",[67,131,72],{"align":56},[67,133,134],{},"Silent broken state; only surfaces when a user is locked out",[12,136,137,138,141,142,145],{},"Set up the health check endpoint if you don't already have one. A simple ",[107,139,140],{},"GET \u002Fhealth"," returning ",[107,143,144],{},"{\"status\": \"ok\"}"," with a 200 is enough. 
During an incident, this is the fastest way to confirm recovery.",[37,147,149],{"id":148},"additional-checks-for-saas-products","Additional Checks for SaaS Products",[42,151,152,162],{},[45,153,154],{},[48,155,156,158,160],{},[51,157,53],{},[51,159,57],{"align":56},[51,161,60],{},[62,163,164,174,184,194],{},[48,165,166,169,171],{},[67,167,168],{},"Core feature API",[67,170,83],{"align":56},[67,172,173],{},"The endpoint behind your product's primary value",[48,175,176,179,181],{},[67,177,178],{},"Webhook delivery endpoint",[67,180,72],{"align":56},[67,182,183],{},"Webhook failures are silent: customers see nothing, their integrations just stop",[48,185,186,189,191],{},[67,187,188],{},"Billing \u002F subscription API",[67,190,72],{"align":56},[67,192,193],{},"A broken billing page blocks upgrades and causes churn at renewal",[48,195,196,199,201],{},[67,197,198],{},"User dashboard",[67,200,72],{"align":56},[67,202,203],{},"The page users land on after login; degraded performance is noticed immediately",[37,205,207],{"id":206},"additional-checks-for-e-commerce","Additional Checks for E-commerce",[42,209,210,220],{},[45,211,212],{},[48,213,214,216,218],{},[51,215,53],{},[51,217,57],{"align":56},[51,219,60],{},[62,221,222,232,242,252,262],{},[48,223,224,227,229],{},[67,225,226],{},"Product catalog \u002F listing page",[67,228,83],{"align":56},[67,230,231],{},"If products don't load, nothing sells",[48,233,234,237,239],{},[67,235,236],{},"Cart \u002F checkout page",[67,238,83],{"align":56},[67,240,241],{},"Direct, immediate, measurable revenue loss when broken",[48,243,244,247,249],{},[67,245,246],{},"Payment processor integration",[67,248,83],{"align":56},[67,250,251],{},"Stripe, Braintree, or PayPal endpoint; payment failures are the most urgent alert",[48,253,254,257,259],{},[67,255,256],{},"Order confirmation page",[67,258,83],{"align":56},[67,260,261],{},"Confirms the full purchase flow completed",[48,263,264,267,269],{},[67,265,266],{},"Search \u002F product search 
API",[67,268,72],{"align":56},[67,270,271],{},"Second most impactful e-commerce failure after checkout",[12,273,274],{},"For e-commerce, add a peak multiplier in your alerting expectations: a 4-hour outage during a 10x traffic period costs 10x as much as the same outage on a normal day. Check your checkout monitor first when something breaks during a sale event.",[37,276,278],{"id":277},"additional-checks-for-developer-apis","Additional Checks for Developer APIs",[42,280,281,291],{},[45,282,283],{},[48,284,285,287,289],{},[51,286,53],{},[51,288,57],{"align":56},[51,290,60],{},[62,292,293,306,316],{},[48,294,295,298,300],{},[67,296,297],{},"Primary API base URL",[67,299,83],{"align":56},[67,301,302,305],{},[107,303,304],{},"api.yourdomain.com"," with a lightweight authenticated request",[48,307,308,311,313],{},[67,309,310],{},"Auth \u002F token endpoint",[67,312,83],{"align":56},[67,314,315],{},"If auth breaks, all API consumers break simultaneously",[48,317,318,321,323],{},[67,319,320],{},"Documentation site",[67,322,72],{"align":56},[67,324,325,328],{},[107,326,327],{},"docs.yourdomain.com","; downtime during an evaluation kills deals",[26,330],{},[29,332,334],{"id":333},"ssl-certificate-monitors","SSL Certificate Monitors",[12,336,337],{},"SSL failures block all users immediately. The browser shows a full-page warning; most users don't click through. 
Set expiry alerts well in advance, because 7 days is too short if renewal requires vendor coordination or a DNS change.",[42,339,340,351],{},[45,341,342],{},[48,343,344,346,348],{},[51,345,53],{},[51,347,57],{"align":56},[51,349,350],{},"Recommended alert thresholds",[62,352,353,363,373,383,393],{},[48,354,355,358,360],{},[67,356,357],{},"Primary domain SSL",[67,359,83],{"align":56},[67,361,362],{},"90, 60, 30, 7, 1 day before expiry",[48,364,365,368,370],{},[67,366,367],{},"API subdomain SSL",[67,369,83],{"align":56},[67,371,372],{},"Same; expires independently of your main domain",[48,374,375,378,380],{},[67,376,377],{},"App subdomain SSL",[67,379,83],{"align":56},[67,381,382],{},"Same",[48,384,385,388,390],{},[67,386,387],{},"Docs \u002F marketing subdomains",[67,389,72],{"align":56},[67,391,392],{},"30, 7, 1 day before expiry",[48,394,395,398,400],{},[67,396,397],{},"Custom customer domains",[67,399,72],{"align":56},[67,401,402],{},"If you support CNAME-based custom domains, monitor a sample set; auto-renewal failures are common here",[12,404,405],{},"Don't rely on auto-renewal alone. Let's Encrypt, AWS ACM, and commercial CA portals all have failure modes: DNS validation errors, expired billing, misconfigured ACME clients, CDN certificate caching. Monitoring catches silent renewal failures before they cause outages.",[26,407],{},[29,409,411],{"id":410},"domain-expiry-monitors","Domain Expiry Monitors",[12,413,414],{},"Domain expiry is rarer than SSL expiry but more catastrophic. An expired domain takes your entire product offline, including the SSL certificate, DNS, and email. 
Recovery involves your registrar's support queue.",[42,416,417,427],{},[45,418,419],{},[48,420,421,423,425],{},[51,422,53],{},[51,424,57],{"align":56},[51,426,350],{},[62,428,429,439,459],{},[48,430,431,434,436],{},[67,432,433],{},"Primary domain",[67,435,83],{"align":56},[67,437,438],{},"90, 60, 30, 14 days before expiry",[48,440,441,444,446],{},[67,442,443],{},"Brand protection domains",[67,445,72],{"align":56},[67,447,448,451,452,451,455,458],{},[107,449,450],{},".io",", ",[107,453,454],{},".co",[107,456,457],{},".net"," variants you own; expiry lets squatters take them",[48,460,461,464,466],{},[67,462,463],{},"Acquired product domains",[67,465,72],{"align":56},[67,467,468],{},"Alert at 60 days; these often have different registrar accounts",[26,470],{},[29,472,474],{"id":473},"heartbeat-monitors","Heartbeat Monitors",[12,476,477],{},"Heartbeat monitoring inverts the check: instead of you pinging the job, the job pings a URL on each successful run. If the ping stops arriving, the monitor alerts. 
This is the only reliable way to detect silent cron failures.",[42,479,480,491],{},[45,481,482],{},[48,483,484,487,489],{},[51,485,486],{},"Job",[51,488,57],{"align":56},[51,490,60],{},[62,492,493,503,513,523,533,543,553],{},[48,494,495,498,500],{},[67,496,497],{},"Database backup job",[67,499,83],{"align":56},[67,501,502],{},"A backup that silently stops running is a disaster waiting for a trigger",[48,504,505,508,510],{},[67,506,507],{},"Billing renewal \u002F subscription sync",[67,509,83],{"align":56},[67,511,512],{},"Subscription states diverge from your payment processor; silent revenue loss",[48,514,515,518,520],{},[67,516,517],{},"Email delivery queue",[67,519,83],{"align":56},[67,521,522],{},"Transactional emails (receipts, resets, notifications) stop without any error",[48,524,525,528,530],{},[67,526,527],{},"User notification job",[67,529,72],{"align":56},[67,531,532],{},"Digest emails, alerts, summaries; users notice when these go missing",[48,534,535,538,540],{},[67,536,537],{},"Data sync \u002F ETL pipeline",[67,539,72],{"align":56},[67,541,542],{},"Stale data surfaces as product bugs, not monitoring alerts",[48,544,545,548,550],{},[67,546,547],{},"Report generation job",[67,549,72],{"align":56},[67,551,552],{},"Scheduled reports that internal teams rely on",[48,554,555,558,561],{},[67,556,557],{},"Cleanup \u002F maintenance jobs",[67,559,560],{"align":56},"🟢",[67,562,563],{},"Log rotation, temp file cleanup, expired session purge",[12,565,566],{},"Configure heartbeat intervals to match your cron schedule plus a 10–20% grace period. 
A job that runs every hour should have a heartbeat window of 66–72 minutes, not 60, to account for startup time and processing delays.",[26,568],{},[29,570,572],{"id":571},"tcp-port-monitors","TCP Port Monitors",[12,574,575],{},"Use for services that don't expose HTTP endpoints.",[42,577,578,590],{},[45,579,580],{},[48,581,582,585,588],{},[51,583,584],{},"Port",[51,586,587],{},"Service",[51,589,57],{"align":56},[62,591,592,602,612,622,632,642,652],{},[48,593,594,597,600],{},[67,595,596],{},"5432",[67,598,599],{},"PostgreSQL",[67,601,83],{"align":56},[48,603,604,607,610],{},[67,605,606],{},"3306",[67,608,609],{},"MySQL",[67,611,83],{"align":56},[48,613,614,617,620],{},[67,615,616],{},"27017",[67,618,619],{},"MongoDB",[67,621,83],{"align":56},[48,623,624,627,630],{},[67,625,626],{},"6379",[67,628,629],{},"Redis",[67,631,83],{"align":56},[48,633,634,637,640],{},[67,635,636],{},"587 \u002F 465",[67,638,639],{},"SMTP",[67,641,72],{"align":56},[48,643,644,647,650],{},[67,645,646],{},"22",[67,648,649],{},"SSH",[67,651,72],{"align":56},[48,653,654,657,660],{},[67,655,656],{},"3389",[67,658,659],{},"RDP",[67,661,560],{"align":56},[12,663,664],{},"A database host that stops accepting TCP connections causes application failures that surface as HTTP 500 errors, not as \"database unavailable.\" The TCP port monitor tells you the failure is at the infrastructure layer before you spend 30 minutes debugging application code.",[26,666],{},[29,668,670],{"id":669},"dns-monitors","DNS Monitors",[12,672,673],{},"DNS changes are rare, which is exactly why unexpected changes are significant. 
Alert on any value change rather than setting specific thresholds; the expected value of an NS record should never change without advance planning.",[42,675,676,688],{},[45,677,678],{},[48,679,680,683,685],{},[51,681,682],{},"Record",[51,684,57],{"align":56},[51,686,687],{},"Alert condition",[62,689,690,700,710,720,729,739],{},[48,691,692,695,697],{},[67,693,694],{},"Primary domain A record",[67,696,83],{"align":56},[67,698,699],{},"Any IP address change",[48,701,702,705,707],{},[67,703,704],{},"NS records",[67,706,83],{"align":56},[67,708,709],{},"Any change; unexpected NS changes are the strongest signal of DNS hijacking",[48,711,712,715,717],{},[67,713,714],{},"MX records",[67,716,72],{"align":56},[67,718,719],{},"Any change; stops email delivery for your entire domain",[48,721,722,725,727],{},[67,723,724],{},"API subdomain A record",[67,726,72],{"align":56},[67,728,699],{},[48,730,731,734,736],{},[67,732,733],{},"SPF TXT record",[67,735,560],{"align":56},[67,737,738],{},"Value change; affects email deliverability and spam filter performance",[48,740,741,744,746],{},[67,742,743],{},"DMARC TXT record",[67,745,560],{"align":56},[67,747,748],{},"Value change",[26,750],{},[29,752,754],{"id":753},"recommended-setup-order","Recommended Setup Order",[12,756,757],{},"If you're starting from zero, this order prioritizes coverage of the most impactful failures:",[759,760,761,765,768,771,774,777,780,783,786,789],"ol",{},[762,763,764],"li",{},"Login endpoint (HTTP)",[762,766,767],{},"Primary API endpoint (HTTP)",[762,769,770],{},"Primary domain SSL certificate",[762,772,773],{},"Homepage (HTTP)",[762,775,776],{},"Checkout or core feature endpoint (HTTP)",[762,778,779],{},"Primary domain expiry (WHOIS\u002FRDAP)",[762,781,782],{},"Database backup cron (heartbeat)",[762,784,785],{},"Billing sync cron (heartbeat)",[762,787,788],{},"Database TCP port",[762,790,791],{},"NS records (DNS)",[12,793,794],{},"These 10 monitors cover the failures most likely to affect users and the 
silent failures most likely to compound into larger problems. Add the rest of the list once these are stable.",[29,796,798],{"id":797},"monitor-settings-reference","Monitor Settings Reference",[42,800,801,814],{},[45,802,803],{},[48,804,805,808,811],{},[51,806,807],{},"Monitor type",[51,809,810],{"align":56},"Check interval",[51,812,813],{},"Alert after",[62,815,816,827,838,849,860,871,880],{},[48,817,818,821,824],{},[67,819,820],{},"HTTP: critical endpoints",[67,822,823],{"align":56},"1 minute",[67,825,826],{},"2 consecutive failures from all regions",[48,828,829,832,835],{},[67,830,831],{},"HTTP: secondary pages",[67,833,834],{"align":56},"5 minutes",[67,836,837],{},"2 consecutive failures",[48,839,840,843,846],{},[67,841,842],{},"SSL certificate",[67,844,845],{"align":56},"12 hours",[67,847,848],{},"At 90\u002F60\u002F30\u002F7\u002F1 days before expiry",[48,850,851,854,857],{},[67,852,853],{},"Domain expiry",[67,855,856],{"align":56},"Daily",[67,858,859],{},"At 90\u002F60\u002F30\u002F14 days before expiry",[48,861,862,865,868],{},[67,863,864],{},"Heartbeat",[67,866,867],{"align":56},"Match cron schedule + 10%",[67,869,870],{},"1 missed expected ping",[48,872,873,876,878],{},[67,874,875],{},"TCP port",[67,877,834],{"align":56},[67,879,837],{},[48,881,882,885,888],{},[67,883,884],{},"DNS record",[67,886,887],{"align":56},"15 minutes",[67,889,890],{},"Any value change",[12,892,893],{},"Requiring 2 consecutive failures before alerting eliminates most false positives caused by transient network issues. 
A monitor checking every minute that requires 2 consecutive failures still alerts within 2 minutes of a real outage, fast enough for any production incident.",[29,895,897],{"id":896},"frequently-asked-questions","Frequently Asked Questions",[37,899,901],{"id":900},"how-many-monitors-do-i-need","How many monitors do I need?",[12,903,904],{},"For a typical SaaS product, 15–25 monitors cover everything: 6–10 HTTP checks, 3–5 SSL certificates, 1–2 domain expiry monitors, 3–5 heartbeat monitors, and a handful of DNS and TCP checks. More monitors add coverage; they don't improve detection speed for the monitors you already have.",[37,906,908],{"id":907},"should-i-monitor-staging-as-well-as-production","Should I monitor staging as well as production?",[12,910,911],{},"Monitor production first, completely. Staging monitors are useful for catching deployment issues before they reach production, but they're a secondary concern. A broken staging environment that hasn't been monitored for a week costs nothing; a broken production login endpoint that hasn't been monitored for an hour costs customers.",[37,913,915],{"id":914},"what-check-interval-should-i-use","What check interval should I use?",[12,917,918],{},"1 minute for anything customer-facing that generates revenue or blocks access. 5 minutes for secondary pages. Faster than 1 minute is rarely necessary; most outages aren't recovered in under a minute, so additional checks don't change your response time.",[37,920,922],{"id":921},"do-i-need-separate-tools-for-each-monitor-type","Do I need separate tools for each monitor type?",[12,924,925],{},"No. Vantaj monitors HTTP endpoints, SSL certificates, domain expiry, heartbeats, TCP ports, and DNS records from a single dashboard. 
The free tier covers 20 monitors across all types, enough to get full coverage for most small products.",[12,927,928,929,451,934,938,939,943],{},"For a deeper look at each monitor type, see ",[930,931,933],"a",{"href":932},"\u002Fblog\u002Ficmp-ping-monitoring","ICMP ping monitoring",[930,935,937],{"href":936},"\u002Fblog\u002Fheartbeat-monitoring-cron-jobs","heartbeat monitoring for cron jobs",", and ",[930,940,942],{"href":941},"\u002Fblog\u002Fdns-monitoring-guide","DNS monitoring",".",{"title":945,"searchDepth":946,"depth":946,"links":947},"",2,[948,955,956,957,958,959,960,961,962],{"id":31,"depth":946,"text":32,"children":949},[950,952,953,954],{"id":39,"depth":951,"text":40},3,{"id":148,"depth":951,"text":149},{"id":206,"depth":951,"text":207},{"id":277,"depth":951,"text":278},{"id":333,"depth":946,"text":334},{"id":410,"depth":946,"text":411},{"id":473,"depth":946,"text":474},{"id":571,"depth":946,"text":572},{"id":669,"depth":946,"text":670},{"id":753,"depth":946,"text":754},{"id":797,"depth":946,"text":798},{"id":896,"depth":946,"text":897,"children":963},[964,965,966,967],{"id":900,"depth":951,"text":901},{"id":907,"depth":951,"text":908},{"id":914,"depth":951,"text":915},{"id":921,"depth":951,"text":922},"tutorials","2026-06-26","47 prioritized checks across HTTP, SSL, domain expiry, heartbeat, TCP, and DNS, organized by business type. Use this when setting up monitoring from scratch or auditing an existing setup.","md",null,{},true,"\u002Fblog\u002Fwhat-to-monitor-checklist",10,{"title":5,"description":970},"blog\u002Fwhat-to-monitor-checklist","nzfOT2D_OebwNP7Kns96L8xMw7XqKIWAJjs9n4Gceo4",1782464113731]