From eb138fc96011167138159590178f9584d6972014 Mon Sep 17 00:00:00 2001 From: Curtis Castrapel Date: Wed, 8 Apr 2020 08:38:40 -0700 Subject: [PATCH] Add default celery metrics and logging using celery signals --- lemur/common/celery.py | 223 +++++++++++++++++++++++++++++++++-------- 1 file changed, 180 insertions(+), 43 deletions(-) diff --git a/lemur/common/celery.py b/lemur/common/celery.py index ebf85ed7..b0193515 100644 --- a/lemur/common/celery.py +++ b/lemur/common/celery.py @@ -10,27 +10,27 @@ command: celery -A lemur.common.celery worker --loglevel=info -l DEBUG -B import copy import sys import time -from datetime import datetime, timezone, timedelta - from celery import Celery +from celery.app.task import Context from celery.exceptions import SoftTimeLimitExceeded +from celery.signals import task_failure, task_received, task_revoked, task_success +from datetime import datetime, timezone, timedelta from flask import current_app from lemur.authorities.service import get as get_authority +from lemur.certificates import cli as cli_certificate from lemur.common.redis import RedisHandler from lemur.destinations import service as destinations_service +from lemur.dns_providers import cli as cli_dns_providers +from lemur.endpoints import cli as cli_endpoints from lemur.extensions import metrics, sentry from lemur.factory import create_app +from lemur.notifications import cli as cli_notification from lemur.notifications.messaging import send_pending_failure_notification from lemur.pending_certificates import service as pending_certificate_service from lemur.plugins.base import plugins from lemur.sources.cli import clean, sync, validate_sources from lemur.sources.service import add_aws_destination_to_sources -from lemur.certificates import cli as cli_certificate -from lemur.dns_providers import cli as cli_dns_providers -from lemur.notifications import cli as cli_notification -from lemur.endpoints import cli as cli_endpoints - if current_app: flask_app = current_app @@ -67,7 +67,7 @@ def is_task_active(fun, task_id, args): from celery.task.control import inspect if not args: - args = '()' # empty args + args = "()" # empty args i = inspect() active_tasks = i.active() @@ -80,6 +80,37 @@ def is_task_active(fun, task_id, args): return False +def get_celery_request_tags(**kwargs): + request = kwargs.get("request") + sender_hostname = "unknown" + sender = kwargs.get("sender") + if sender: + try: + sender_hostname = sender.hostname + except AttributeError: + sender_hostname = vars(sender.request).get("origin", "unknown") + if request and not isinstance( + request, Context + ): # unlike others, task_revoked sends a Context for `request` + task_name = request.name + task_id = request.id + receiver_hostname = request.hostname + else: + task_name = sender.name + task_id = sender.request.id + receiver_hostname = sender.request.hostname + + tags = { + "task_name": task_name, + "task_id": task_id, + "sender_hostname": sender_hostname, + "receiver_hostname": receiver_hostname, + } + if kwargs.get("exception"): + tags["error"] = repr(kwargs["exception"]) + return tags + + @celery.task() def report_celery_last_success_metrics(): """ @@ -108,15 +139,115 @@ def report_celery_last_success_metrics(): return current_time = int(time.time()) - schedule = current_app.config.get('CELERYBEAT_SCHEDULE') + schedule = current_app.config.get("CELERYBEAT_SCHEDULE") for _, t in schedule.items(): task = t.get("task") last_success = int(red.get(f"{task}.last_success") or 0) - metrics.send(f"{task}.time_since_last_success", 'gauge', current_time - last_success) + metrics.send( + f"{task}.time_since_last_success", "gauge", current_time - last_success + ) red.set( f"{function}.last_success", int(time.time()) ) # Alert if this metric is not seen - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + + +@task_received.connect +def report_number_pending_tasks(**kwargs): + """ + Report the number of pending tasks to our metrics broker every time a task is published. This metric can be used + for autoscaling workers. + https://docs.celeryproject.org/en/latest/userguide/signals.html#task-received + + :param sender: + :param headers: + :param body: + :param kwargs: + :return: + """ + with flask_app.app_context(): + metrics.send( + "celery.new_pending_task", + "TIMER", + 1, + metric_tags=get_celery_request_tags(**kwargs), + ) + + +@task_success.connect +def report_successful_task(**kwargs): + """ + Report a generic success metric as tasks to our metrics broker every time a task finished correctly. + This metric can be used for autoscaling workers. + https://docs.celeryproject.org/en/latest/userguide/signals.html#task-success + + :param sender: + :param headers: + :param body: + :param kwargs: + :return: + """ + with flask_app.app_context(): + tags = get_celery_request_tags(**kwargs) + red.set(f"{tags['task_name']}.last_success", int(time.time())) + metrics.send("celery.successful_task", "TIMER", 1, metric_tags=tags) + + +@task_failure.connect +def report_failed_task(**kwargs): + """ + Report a generic failure metric as tasks to our metrics broker every time a task fails. + This metric can be used for alerting. + https://docs.celeryproject.org/en/latest/userguide/signals.html#task-failure + + :param sender: + :param headers: + :param body: + :param kwargs: + :return: + """ + with flask_app.app_context(): + log_data = { + "function": f"{__name__}.{sys._getframe().f_code.co_name}", + "Message": "Celery Task Failure", + } + + # Add traceback if exception info is in the kwargs + einfo = kwargs.get("einfo") + if einfo: + log_data["traceback"] = einfo.traceback + + error_tags = get_celery_request_tags(**kwargs) + + log_data.update(error_tags) + current_app.logger.error(log_data) + metrics.send("celery.failed_task", "TIMER", 1, metric_tags=error_tags) + + +@task_revoked.connect +def report_revoked_task(**kwargs): + """ + Report a generic failure metric as tasks to our metrics broker every time a task is revoked. + This metric can be used for alerting. + https://docs.celeryproject.org/en/latest/userguide/signals.html#task-revoked + + :param sender: + :param headers: + :param body: + :param kwargs: + :return: + """ + with flask_app.app_context(): + log_data = { + "function": f"{__name__}.{sys._getframe().f_code.co_name}", + "Message": "Celery Task Revoked", + } + + error_tags = get_celery_request_tags(**kwargs) + + log_data.update(error_tags) + current_app.logger.error(log_data) + metrics.send("celery.revoked_task", "TIMER", 1, metric_tags=error_tags) @celery.task(soft_time_limit=600) @@ -217,15 +348,15 @@ def fetch_acme_cert(id): log_data["failed"] = failed log_data["wrong_issuer"] = wrong_issuer current_app.logger.debug(log_data) - metrics.send(f"{function}.resolved", 'gauge', new) - metrics.send(f"{function}.failed", 'gauge', failed) - metrics.send(f"{function}.wrong_issuer", 'gauge', wrong_issuer) + metrics.send(f"{function}.resolved", "gauge", new) + metrics.send(f"{function}.failed", "gauge", failed) + metrics.send(f"{function}.wrong_issuer", "gauge", wrong_issuer) print( "[+] Certificates: New: {new} Failed: {failed} Not using ACME: {wrong_issuer}".format( new=new, failed=failed, wrong_issuer=wrong_issuer ) ) - red.set(f'{function}.last_success', int(time.time())) + return log_data @celery.task() @@ -262,8 +393,8 @@ def fetch_all_pending_acme_certs(): current_app.logger.debug(log_data) fetch_acme_cert.delay(cert.id) - red.set(f'{function}.last_success', int(time.time())) - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + return log_data @celery.task() @@ -296,8 +427,8 @@ def remove_old_acme_certs(): current_app.logger.debug(log_data) pending_certificate_service.delete(cert) - red.set(f'{function}.last_success', int(time.time())) - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + return log_data @celery.task() @@ -328,8 +459,8 @@ def clean_all_sources(): current_app.logger.debug(log_data) clean_source.delay(source.label) - red.set(f'{function}.last_success', int(time.time())) - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + return log_data @celery.task(soft_time_limit=3600) @@ -366,6 +497,7 @@ def clean_source(source): current_app.logger.error(log_data) sentry.captureException() metrics.send("celery.timeout", "counter", 1, metric_tags={"function": function}) + return log_data @celery.task() @@ -395,8 +527,8 @@ def sync_all_sources(): current_app.logger.debug(log_data) sync_source.delay(source.label) - red.set(f'{function}.last_success', int(time.time())) - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + return log_data @celery.task(soft_time_limit=7200) @@ -428,19 +560,23 @@ def sync_source(source): current_app.logger.debug(log_data) try: sync([source]) - metrics.send(f"{function}.success", 'counter', 1, metric_tags={"source": source}) + metrics.send( + f"{function}.success", "counter", 1, metric_tags={"source": source} + ) except SoftTimeLimitExceeded: log_data["message"] = "Error syncing source: Time limit exceeded." current_app.logger.error(log_data) sentry.captureException() - metrics.send("sync_source_timeout", "counter", 1, metric_tags={"source": source}) + metrics.send( + "sync_source_timeout", "counter", 1, metric_tags={"source": source} + ) metrics.send("celery.timeout", "counter", 1, metric_tags={"function": function}) return log_data["message"] = "Done syncing source" current_app.logger.debug(log_data) - metrics.send(f"{function}.success", 'counter', 1, metric_tags={"source": source}) - red.set(f'{function}.last_success', int(time.time())) + metrics.send(f"{function}.success", "counter", 1, metric_tags={"source": source}) + return log_data @celery.task() @@ -477,8 +613,8 @@ def sync_source_destination(): log_data["message"] = "completed Syncing AWS destinations and sources" current_app.logger.debug(log_data) - red.set(f'{function}.last_success', int(time.time())) - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + return log_data @celery.task(soft_time_limit=3600) @@ -515,8 +651,8 @@ def certificate_reissue(): log_data["message"] = "reissuance completed" current_app.logger.debug(log_data) - red.set(f'{function}.last_success', int(time.time())) - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + return log_data @celery.task(soft_time_limit=3600) @@ -534,7 +670,6 @@ def certificate_rotate(): "function": function, "message": "rotating certificates", "task_id": task_id, - } if task_id and is_task_active(function, task_id, None): @@ -554,8 +689,8 @@ def certificate_rotate(): log_data["message"] = "rotation completed" current_app.logger.debug(log_data) - red.set(f'{function}.last_success', int(time.time())) - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + return log_data @celery.task(soft_time_limit=3600) @@ -590,8 +725,8 @@ def endpoints_expire(): metrics.send("celery.timeout", "counter", 1, metric_tags={"function": function}) return - red.set(f'{function}.last_success', int(time.time())) - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + return log_data @celery.task(soft_time_limit=600) @@ -626,8 +761,8 @@ def get_all_zones(): metrics.send("celery.timeout", "counter", 1, metric_tags={"function": function}) return - red.set(f'{function}.last_success', int(time.time())) - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + return log_data @celery.task(soft_time_limit=3600) @@ -662,8 +797,8 @@ def check_revoked(): metrics.send("celery.timeout", "counter", 1, metric_tags={"function": function}) return - red.set(f'{function}.last_success', int(time.time())) - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + return log_data @celery.task(soft_time_limit=3600) @@ -690,7 +825,9 @@ def notify_expirations(): current_app.logger.debug(log_data) try: - cli_notification.expirations(current_app.config.get("EXCLUDE_CN_FROM_NOTIFICATION", [])) + cli_notification.expirations( + current_app.config.get("EXCLUDE_CN_FROM_NOTIFICATION", []) + ) except SoftTimeLimitExceeded: log_data["message"] = "Notify expiring Time limit exceeded." current_app.logger.error(log_data) @@ -698,5 +835,5 @@ def notify_expirations(): metrics.send("celery.timeout", "counter", 1, metric_tags={"function": function}) return - red.set(f'{function}.last_success', int(time.time())) - metrics.send(f"{function}.success", 'counter', 1) + metrics.send(f"{function}.success", "counter", 1) + return log_data