Warn when a job should start but is still running

This will make it easier for users to debug hung jobs: rather than
requiring users to go back to when the job started to realize
(implicitly) that it's not running, this will warn at every interval
that something is wrong.
This commit is contained in:
Thomas Orozco
2018-01-10 13:37:52 +01:00
parent 8e0d9341a2
commit 9a964c8cb5
3 changed files with 38 additions and 10 deletions

View File

@@ -2,6 +2,7 @@ package cron
import (
"bufio"
"context"
"fmt"
"github.com/aptible/supercronic/crontab"
"github.com/sirupsen/logrus"
@@ -59,17 +60,17 @@ func startReaderDrain(wg *sync.WaitGroup, readerLogger *logrus.Entry, reader io.
}()
}
func runJob(context *crontab.Context, command string, jobLogger *logrus.Entry) error {
func runJob(cronCtx *crontab.Context, command string, jobLogger *logrus.Entry) error {
jobLogger.Info("starting")
cmd := exec.Command(context.Shell, "-c", command)
cmd := exec.Command(cronCtx.Shell, "-c", command)
// Run in a separate process group so that in interactive usage, CTRL+C
// stops supercronic, not the children threads.
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
env := os.Environ()
for k, v := range context.Environ {
for k, v := range cronCtx.Environ {
env = append(env, fmt.Sprintf("%s=%s", k, v))
}
cmd.Env = env
@@ -105,7 +106,22 @@ func runJob(context *crontab.Context, command string, jobLogger *logrus.Entry) e
return nil
}
func StartJob(wg *sync.WaitGroup, context *crontab.Context, job *crontab.Job, exitChan chan interface{}, cronLogger *logrus.Entry) {
func monitorJob(ctx context.Context, expression crontab.Expression, t0 time.Time, jobLogger *logrus.Entry) {
t := t0
for {
t = expression.Next(t)
select {
case <-time.After(time.Until(t)):
jobLogger.Warnf("not starting: job is still running since %s (%s elapsed)", t0, t.Sub(t0))
case <-ctx.Done():
return
}
}
}
func StartJob(wg *sync.WaitGroup, cronCtx *crontab.Context, job *crontab.Job, exitChan chan interface{}, cronLogger *logrus.Entry) {
wg.Add(1)
go func() {
@@ -139,7 +155,14 @@ func StartJob(wg *sync.WaitGroup, context *crontab.Context, job *crontab.Job, ex
"iteration": cronIteration,
})
err := runJob(context, job.Command, jobLogger)
err := func() error {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go monitorJob(ctx, job.Expression, nextRun, jobLogger)
return runJob(cronCtx, job.Command, jobLogger)
}()
if err == nil {
jobLogger.Info("job succeeded")

View File

@@ -1,12 +1,12 @@
function run_supercronic() {
local crontab="$1"
local timeout="${2:-2s}"
local timeout="${2:-1s}"
timeout --preserve-status --kill-after "30s" "$timeout" \
"${BATS_TEST_DIRNAME}/../supercronic" ${SUPERCRONIC_ARGS:-} "$crontab" 2>&1
}
@test "it starts" {
run_supercronic "${BATS_TEST_DIRNAME}/noop.crontab" 2s
run_supercronic "${BATS_TEST_DIRNAME}/noop.crontab"
}
@test "it runs a cron job" {
@@ -23,11 +23,16 @@ function run_supercronic() {
}
@test "it warns when USER is set" {
run_supercronic "${BATS_TEST_DIRNAME}/user.crontab" 5s | grep -iE "processes will not.*USER="
run_supercronic "${BATS_TEST_DIRNAME}/user.crontab" 1s | grep -iE "processes will not.*USER="
}
@test "it warns when a job is falling behind" {
run_supercronic "${BATS_TEST_DIRNAME}/timeout.crontab" 5s | grep -iE "job took too long to run"
run_supercronic "${BATS_TEST_DIRNAME}/timeout.crontab" 1s | grep -iE "job took too long to run"
}
@test "it warns repeatedly when a job is still running" {
n="$(run_supercronic "${BATS_TEST_DIRNAME}/timeout.crontab" 1s | grep -iE "job is still running" | wc -l)"
[[ "$n" -eq 2 ]]
}
@test "it supports debug logging " {

View File

@@ -1 +1 @@
* * * * * * * sleep 2
* * * * * * * sleep 2.5