1770 update log format and add additional fields to webhook server logs (#1771)

* 1770 update log format and add runID and ID to workflow logs

update tests, change log format for controllers.HorizontalRunnerAutoscalerGitHubWebhook

use logging package

remove unused modules

add setup name to setuplog

add flag to change log format

change flag name to enableProdLogConfig

move log opts to logger package

remove empty else and reset timeEncoder

update flag description

use get function to handle nil

rename flag and update logger function

Update main.go

Co-authored-by: Yusuke Kuoka <ykuoka@gmail.com>

Update controllers/horizontal_runner_autoscaler_webhook.go

Co-authored-by: Yusuke Kuoka <ykuoka@gmail.com>

Update logging/logger.go

Co-authored-by: Yusuke Kuoka <ykuoka@gmail.com>

copy log opts for each NewLogger call

revert to use autoscaler.log

update flag description and remove unused imports

add logFormat to readme

rename setupLog to logger

make fmt

* Fix E2E along the way

Co-authored-by: Yusuke Kuoka <ykuoka@gmail.com>
malachiobadeyi authored on 2022-11-04 01:46:58 +00:00, committed by GitHub
parent 63e8f32281
commit fbdfe0df8c
12 changed files with 150 additions and 60 deletions


@@ -235,6 +235,8 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
"repository.owner.type", e.Repo.Owner.GetType(),
"enterprise.slug", enterpriseSlug,
"action", e.GetAction(),
"workflowJob.runID", e.WorkflowJob.GetRunID(),
"workflowJob.ID", e.WorkflowJob.GetID(),
)
}
@@ -343,7 +345,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
msg := fmt.Sprintf("scaled %s by %d", target.Name, target.Amount)
- autoscaler.Log.Info(msg)
+ log.Info(msg)
if written, err := w.Write([]byte(msg)); err != nil {
log.Error(err, "failed writing http response", "msg", msg, "written", written)


@@ -616,14 +616,14 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
// Scale-up to 2 replicas on first workflow_job.queued webhook event
{
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"self-hosted"})
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"self-hosted"}, int64(1234), int64(4321))
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 2, "runners after first webhook event")
env.ExpectRegisteredNumberCountEventuallyEquals(2, "count of fake list runners")
}
// Scale-up to 3 replicas on second workflow_job.queued webhook event
{
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"self-hosted"})
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"self-hosted"}, int64(1234), int64(4321))
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 3, "runners after second webhook event")
env.ExpectRegisteredNumberCountEventuallyEquals(3, "count of fake list runners")
}
@@ -631,7 +631,7 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
// Do not scale-up on third workflow_job.queued webhook event
// repo "example" doesn't match our Spec
{
env.SendWorkflowJobEvent("test", "example", "queued", []string{"self-hosted"})
env.SendWorkflowJobEvent("test", "example", "queued", []string{"self-hosted"}, int64(1234), int64(4321))
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 3, "runners after third webhook event")
env.ExpectRegisteredNumberCountEventuallyEquals(3, "count of fake list runners")
}
@@ -1250,7 +1250,7 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
// Scale-up to 2 replicas on first workflow_job webhook event
{
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"self-hosted"})
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"self-hosted"}, int64(1234), int64(4321))
ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1, "runner sets after webhook")
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 2, "runners after first webhook event")
env.ExpectRegisteredNumberCountEventuallyEquals(2, "count of fake list runners")
@@ -1334,7 +1334,7 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
// Scale-up to 2 replicas on first workflow_job webhook event
{
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"custom-label"})
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"custom-label"}, int64(1234), int64(4321))
ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1, "runner sets after webhook")
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 2, "runners after first webhook event")
env.ExpectRegisteredNumberCountEventuallyEquals(2, "count of fake list runners")
@@ -1415,9 +1415,11 @@ func (env *testEnvironment) SendOrgCheckRunEvent(org, repo, status, action strin
ExpectWithOffset(1, resp.StatusCode).To(Equal(200))
}
- func (env *testEnvironment) SendWorkflowJobEvent(org, repo, statusAndAction string, labels []string) {
+ func (env *testEnvironment) SendWorkflowJobEvent(org, repo, statusAndAction string, labels []string, runID int64, ID int64) {
resp, err := sendWebhook(env.webhookServer, "workflow_job", &github.WorkflowJobEvent{
WorkflowJob: &github.WorkflowJob{
ID: &ID,
RunID: &runID,
Status: &statusAndAction,
Labels: labels,
},


@@ -110,9 +110,23 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
// Note: This logic is here to prevent a dead-lock between ARC and the PV provider.
//
// The author of this logic thinks that some (or all?) of CSI plugins and PV providers
- // do not supported to provision dynamic PVs for a pod that is already marked for deletion.
+ // do not support provisioning dynamic PVs for a pod that is already marked for deletion.
// If we didn't handle this case here, ARC would end up with waiting forever until the
- // PV provider to provision PVs for the pod, which seems to never happen.
+ // PV provider(s) provision PVs for the pod, which seems to never happen.
+ //
+ // For reference, the below is an example of pod.status that you might see when it happened:
+ // status:
+ // conditions:
+ // - lastProbeTime: null
+ // lastTransitionTime: "2022-11-04T00:04:05Z"
+ // message: 'binding rejected: running Bind plugin "DefaultBinder": Operation cannot
+ // be fulfilled on pods/binding "org-runnerdeploy-xv2lg-pm6t2": pod org-runnerdeploy-xv2lg-pm6t2
+ // is being deleted, cannot be assigned to a host'
+ // reason: SchedulerError
+ // status: "False"
+ // type: PodScheduled
+ // phase: Pending
+ // qosClass: BestEffort
log.Info(
"Unregistration started before runner pod gets scheduled onto a node. "+
"Perhaps the runner is taking a long time due to e.g. slow CSI slugin not giving us a PV in a timely manner, or your Kubernetes cluster is overloaded? "+