Merge pull request #2828 from hosseinsh/improved-logging-alerting-v2
Improved logging alerting v2
commit ee151bdc43
--- a/lemur/common/celery.py
+++ b/lemur/common/celery.py
@@ -9,6 +9,7 @@ command: celery -A lemur.common.celery worker --loglevel=info -l DEBUG -B
 """
 import copy
 import sys
+import time
 from datetime import datetime, timezone, timedelta
 
 from celery import Celery
@@ -16,6 +17,7 @@ from celery.exceptions import SoftTimeLimitExceeded
 from flask import current_app
 
 from lemur.authorities.service import get as get_authority
+from lemur.common.redis import RedisHandler
 from lemur.destinations import service as destinations_service
 from lemur.extensions import metrics, sentry
 from lemur.factory import create_app
@@ -30,6 +32,8 @@ if current_app:
 else:
     flask_app = create_app()
 
+red = RedisHandler().redis()
+
 
 def make_celery(app):
     celery = Celery(
@@ -68,6 +72,30 @@ def is_task_active(fun, task_id, args):
     return False
 
 
+@celery.task()
+def report_celery_last_success_metrics():
+    """
+    For each celery task, this will determine the number of seconds since it was last successful.
+
+    Celery tasks should be emitting redis stats with a deterministic key (in our case, `f"{task}.last_success"`).
+    report_celery_last_success_metrics should be run periodically to emit metrics on when a task was last successful.
+    Admins can then alert when tasks are not run as intended. Admins should also alert when no metrics are emitted
+    from this function.
+
+    """
+    function = f"{__name__}.{sys._getframe().f_code.co_name}"
+    current_time = int(time.time())
+    schedule = current_app.config.get('CELERYBEAT_SCHEDULE')
+    for _, t in schedule.items():
+        task = t.get("task")
+        last_success = int(red.get(f"{task}.last_success") or 0)
+        metrics.send(f"{task}.time_since_last_success", 'gauge', current_time - last_success)
+    red.set(
+        f"{function}.last_success", int(time.time())
+    )  # Alert if this metric is not seen
+    metrics.send(f"{function}.success", 'counter', 1)
+
+
 @celery.task(soft_time_limit=600)
 def fetch_acme_cert(id):
     """
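report_celery_last_success_metrics iterates over CELERYBEAT_SCHEDULE, so each scheduled entry's "task" path doubles as the prefix of its redis key. For reference, a schedule of the shape this task expects might look like the following sketch for a Lemur config file; the task names and intervals here are illustrative, not part of this diff:

    # Hypothetical CELERYBEAT_SCHEDULE excerpt. Each entry's "task" value is
    # what report_celery_last_success_metrics reads to build the
    # f"{task}.last_success" redis key it gauges against.
    from datetime import timedelta

    from celery.schedules import crontab

    CELERYBEAT_SCHEDULE = {
        'fetch_all_pending_acme_certs': {
            'task': 'lemur.common.celery.fetch_all_pending_acme_certs',
            'options': {'expires': 180},
            'schedule': timedelta(minutes=3),
        },
        'report_celery_last_success_metrics': {
            'task': 'lemur.common.celery.report_celery_last_success_metrics',
            'options': {'expires': 180},
            'schedule': crontab(minute='*'),  # emit freshness gauges every minute
        },
    }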
@@ -80,8 +108,9 @@ def fetch_acme_cert(id):
     if celery.current_task:
         task_id = celery.current_task.request.id
 
+    function = f"{__name__}.{sys._getframe().f_code.co_name}"
     log_data = {
-        "function": "{}.{}".format(__name__, sys._getframe().f_code.co_name),
+        "function": function,
         "message": "Resolving pending certificate {}".format(id),
         "task_id": task_id,
         "id": id,
@@ -165,11 +194,15 @@ def fetch_acme_cert(id):
     log_data["failed"] = failed
     log_data["wrong_issuer"] = wrong_issuer
     current_app.logger.debug(log_data)
+    metrics.send(f"{function}.resolved", 'gauge', new)
+    metrics.send(f"{function}.failed", 'gauge', failed)
+    metrics.send(f"{function}.wrong_issuer", 'gauge', wrong_issuer)
     print(
         "[+] Certificates: New: {new} Failed: {failed} Not using ACME: {wrong_issuer}".format(
             new=new, failed=failed, wrong_issuer=wrong_issuer
         )
     )
+    red.set(f'{function}.last_success', int(time.time()))
 
 
 @celery.task()
@@ -177,8 +210,9 @@ def fetch_all_pending_acme_certs():
     """Instantiate celery workers to resolve all pending Acme certificates"""
     pending_certs = pending_certificate_service.get_unresolved_pending_certs()
 
+    function = f"{__name__}.{sys._getframe().f_code.co_name}"
     log_data = {
-        "function": "{}.{}".format(__name__, sys._getframe().f_code.co_name),
+        "function": function,
         "message": "Starting job.",
     }
 
@@ -195,11 +229,18 @@ def fetch_all_pending_acme_certs():
             current_app.logger.debug(log_data)
             fetch_acme_cert.delay(cert.id)
 
+    red.set(f'{function}.last_success', int(time.time()))
+    metrics.send(f"{function}.success", 'counter', 1)
+
 
 @celery.task()
 def remove_old_acme_certs():
     """Prune old pending acme certificates from the database"""
-    log_data = {"function": "{}.{}".format(__name__, sys._getframe().f_code.co_name)}
+    function = f"{__name__}.{sys._getframe().f_code.co_name}"
+    log_data = {
+        "function": function,
+        "message": "Starting job.",
+    }
     pending_certs = pending_certificate_service.get_pending_certs("all")
 
     # Delete pending certs more than a week old
@@ -211,6 +252,9 @@ def remove_old_acme_certs():
             current_app.logger.debug(log_data)
             pending_certificate_service.delete(cert)
 
+    red.set(f'{function}.last_success', int(time.time()))
+    metrics.send(f"{function}.success", 'counter', 1)
+
 
 @celery.task()
 def clean_all_sources():
@@ -218,6 +262,7 @@ def clean_all_sources():
     This function will clean unused certificates from sources. This is a destructive operation and should only
     be run periodically. This function triggers one celery task per source.
     """
+    function = f"{__name__}.{sys._getframe().f_code.co_name}"
     sources = validate_sources("all")
     for source in sources:
         current_app.logger.debug(
@@ -225,6 +270,9 @@ def clean_all_sources():
         )
         clean_source.delay(source.label)
 
+    red.set(f'{function}.last_success', int(time.time()))
+    metrics.send(f"{function}.success", 'counter', 1)
+
 
 @celery.task()
 def clean_source(source):
@@ -244,6 +292,7 @@ def sync_all_sources():
     """
     This function will sync certificates from all sources. This function triggers one celery task per source.
     """
+    function = f"{__name__}.{sys._getframe().f_code.co_name}"
     sources = validate_sources("all")
     for source in sources:
         current_app.logger.debug(
@@ -251,6 +300,9 @@ def sync_all_sources():
         )
         sync_source.delay(source.label)
 
+    red.set(f'{function}.last_success', int(time.time()))
+    metrics.send(f"{function}.success", 'counter', 1)
+
 
 @celery.task(soft_time_limit=7200)
 def sync_source(source):
@@ -261,7 +313,7 @@ def sync_source(source):
     :return:
     """
 
-    function = "{}.{}".format(__name__, sys._getframe().f_code.co_name)
+    function = f"{__name__}.{sys._getframe().f_code.co_name}"
     task_id = None
     if celery.current_task:
         task_id = celery.current_task.request.id
@@ -279,6 +331,7 @@ def sync_source(source):
         return
     try:
         sync([source])
+        metrics.send(f"{function}.success", 'counter', 1, metric_tags={"source": source})
     except SoftTimeLimitExceeded:
         log_data["message"] = "Error syncing source: Time limit exceeded."
         current_app.logger.error(log_data)
@@ -290,6 +343,8 @@ def sync_source(source):
 
     log_data["message"] = "Done syncing source"
     current_app.logger.debug(log_data)
+    metrics.send(f"{function}.success", 'counter', 1, metric_tags={"source": source})
+    red.set(f'{function}.last_success', int(time.time()))
 
 
 @celery.task()
@@ -302,9 +357,12 @@ def sync_source_destination():
     We rely on account numbers to avoid duplicates.
     """
     current_app.logger.debug("Syncing AWS destinations and sources")
+    function = f"{__name__}.{sys._getframe().f_code.co_name}"
 
     for dst in destinations_service.get_all():
         if add_aws_destination_to_sources(dst):
             current_app.logger.debug("Source: %s added", dst.label)
 
     current_app.logger.debug("Completed Syncing AWS destinations and sources")
+    red.set(f'{function}.last_success', int(time.time()))
+    metrics.send(f"{function}.success", 'counter', 1)
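With these changes, every task records f"{function}.last_success" in redis on completion, and report_celery_last_success_metrics turns those timestamps into time_since_last_success gauges. As a rough sketch of the operator-side alerting the docstring describes, a standalone staleness probe might look like this; the connection details, task name, and threshold are assumptions for illustration, not part of this commit:

    # Hypothetical staleness probe against the last_success keys written above.
    import time

    import redis

    red = redis.StrictRedis(host="localhost", port=6379, db=0, decode_responses=True)

    def seconds_since_last_success(task_name):
        # A missing key reads as 0, so a task that has never succeeded
        # shows up as maximally stale rather than silently healthy.
        last_success = int(red.get(f"{task_name}.last_success") or 0)
        return int(time.time()) - last_success

    age = seconds_since_last_success("lemur.common.celery.sync_all_sources")
    if age > 3600:  # no recorded success in the past hour
        print(f"[!] sync_all_sources last succeeded {age}s ago")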
--- /dev/null
+++ b/lemur/common/redis.py
@@ -0,0 +1,38 @@
+"""
+Helper Class for Redis
+
+"""
+import redis
+from flask import current_app
+from lemur.factory import create_app
+
+if current_app:
+    flask_app = current_app
+else:
+    flask_app = create_app()
+
+
+class RedisHandler:
+    def __init__(self, host=flask_app.config.get('REDIS_HOST', 'localhost'),
+                 port=flask_app.config.get('REDIS_PORT', 6379),
+                 db=flask_app.config.get('REDIS_DB', 0)):
+        self.host = host
+        self.port = port
+        self.db = db
+
+    def redis(self, db=0):
+        # The decode_responses flag here directs the client to convert the responses from Redis into Python strings
+        # using the default encoding utf-8. This is client specific.
+        red = redis.StrictRedis(host=self.host, port=self.port, db=self.db, encoding="utf-8", decode_responses=True)
+        return red
+
+
+def redis_get(key, default=None):
+    red = RedisHandler().redis()
+    try:
+        v = red.get(key)
+    except redis.exceptions.ConnectionError:
+        v = None
+    if not v:
+        return default
+    return v
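A minimal sketch of how the new helper might be exercised, assuming an installed Lemur with an initialized app context and a reachable redis; the key and value are illustrative:

    # Illustrative use of RedisHandler and redis_get from the file above.
    from lemur.common.redis import RedisHandler, redis_get

    red = RedisHandler().redis()
    red.set("lemur.common.celery.sync_all_sources.last_success", 1562000000)

    # decode_responses=True means reads come back as str, not bytes.
    print(red.get("lemur.common.celery.sync_all_sources.last_success"))  # "1562000000"

    # redis_get swallows connection errors and returns the default instead,
    # so callers can degrade gracefully when redis is unavailable.
    print(redis_get("nonexistent.key", default=0))  # 0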
@@ -0,0 +1,13 @@
+import fakeredis
+import time
+import sys
+
+
+def test_write_and_read_from_redis():
+    function = f"{__name__}.{sys._getframe().f_code.co_name}"
+
+    red = fakeredis.FakeStrictRedis()
+    key = f"{function}.last_success"
+    value = int(time.time())
+    assert red.set(key, value) is True
+    assert (int(red.get(key)) == value) is True
@@ -5,6 +5,7 @@ black
 coverage
 factory-boy
 Faker
+fakeredis
 freezegun
 moto
 nose
@@ -28,6 +28,7 @@ docutils==0.14            # via botocore
 ecdsa==0.13.2             # via python-jose, sshpubkeys
 factory-boy==2.12.0
 faker==1.0.7
+fakeredis==1.0.3
 flask==1.1.1              # via pytest-flask
 freezegun==0.3.12
 future==0.17.1            # via aws-xray-sdk, python-jose
@@ -64,6 +65,7 @@ python-dateutil==2.8.0    # via botocore, faker, freezegun, moto
 python-jose==3.0.1        # via moto
 pytz==2019.1              # via datetime, moto
 pyyaml==5.1.1
+redis==3.2.1              # via fakeredis
 requests-mock==1.6.0
 requests==2.22.0          # via cfn-lint, docker, moto, requests-mock, responses
 responses==0.10.6         # via moto
@@ -72,6 +74,7 @@ s3transfer==0.2.1         # via boto3
 six==1.12.0               # via aws-sam-translator, bandit, cfn-lint, cryptography, docker, faker, freezegun, jsonschema, mock, moto, packaging, pyrsistent, python-dateutil, python-jose, requests-mock, responses, stevedore, websocket-client
 smmap2==2.0.5             # via gitdb2
 sshpubkeys==3.1.0         # via moto
+sortedcontainers==2.1.0   # via fakeredis
 stevedore==1.30.1         # via bandit
 text-unidecode==1.2       # via faker
 toml==0.10.0              # via black