From 2bc604e5a9821552906ed8706ecf2d5a7fe00e7a Mon Sep 17 00:00:00 2001 From: Curtis Castrapel Date: Thu, 25 Apr 2019 13:50:41 -0700 Subject: [PATCH] Better metrics and error reporting --- lemur/plugins/lemur_acme/dyn.py | 50 +++++++++++++++++++++++++----- lemur/plugins/lemur_acme/plugin.py | 7 +++-- lemur/plugins/lemur_aws/elb.py | 23 +++++++++++--- 3 files changed, 64 insertions(+), 16 deletions(-) diff --git a/lemur/plugins/lemur_acme/dyn.py b/lemur/plugins/lemur_acme/dyn.py index 232c6ca3..4eb01958 100644 --- a/lemur/plugins/lemur_acme/dyn.py +++ b/lemur/plugins/lemur_acme/dyn.py @@ -5,7 +5,7 @@ import dns.exception import dns.name import dns.query import dns.resolver -from dyn.tm.errors import DynectCreateError, DynectGetError +from dyn.tm.errors import DynectCreateError, DynectDeleteError, DynectGetError, DynectUpdateError from dyn.tm.session import DynectSession from dyn.tm.zones import Node, Zone, get_all_zones from flask import current_app @@ -51,17 +51,23 @@ def _has_dns_propagated(name, token): def wait_for_dns_change(change_id, account_number=None): fqdn, token = change_id - number_of_attempts = 10 + number_of_attempts = 20 for attempts in range(0, number_of_attempts): status = _has_dns_propagated(fqdn, token) current_app.logger.debug("Record status for fqdn: {}: {}".format(fqdn, status)) if status: metrics.send('wait_for_dns_change_success', 'counter', 1) break - time.sleep(20) + time.sleep(10) if not status: # TODO: Delete associated DNS text record here metrics.send('wait_for_dns_change_fail', 'counter', 1) + sentry.captureException( + extra={ + "fqdn": fqdn, "txt_record": token} + ) + metrics.send('wait_for_dns_change_error', 'counter', 1, + metric_tags={'fqdn': fqdn, 'txt_record': token}) raise Exception("Unable to query DNS token for fqdn {}.".format(fqdn)) return @@ -105,7 +111,7 @@ def create_txt_record(domain, token, account_number): zone.add_record(node_name, record_type='TXT', txtdata="\"{}\"".format(token), ttl=5) zone.publish() current_app.logger.debug("TXT record created: {0}, token: {1}".format(fqdn, token)) - except DynectCreateError as e: + except (DynectCreateError, DynectUpdateError) as e: if "Cannot duplicate existing record data" in e.message: current_app.logger.debug( "Unable to add record. Domain: {}. Token: {}. " @@ -138,14 +144,33 @@ def delete_txt_record(change_id, account_number, domain, token): all_txt_records = node.get_all_records_by_type('TXT') except DynectGetError: sentry.captureException() - metrics.send('delete_txt_record_error', 'counter', 1) + metrics.send('delete_txt_record_geterror', 'counter', 1) # No Text Records remain or host is not in the zone anymore because all records have been deleted. return for txt_record in all_txt_records: if txt_record.txtdata == ("{}".format(token)): current_app.logger.debug("Deleting TXT record name: {0}".format(fqdn)) - txt_record.delete() - zone.publish() + try: + txt_record.delete() + except DynectDeleteError: + sentry.captureException( + extra={ + "fqdn": fqdn, "zone_name": zone_name, "node_name": node_name, + "txt_record": txt_record.txtdata} + ) + metrics.send('delete_txt_record_deleteerror', 'counter', 1, + metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata}) + + try: + zone.publish() + except DynectUpdateError: + sentry.captureException( + extra={ + "fqdn": fqdn, "zone_name": zone_name, "node_name": node_name, + "txt_record": txt_record.txtdata} + ) + metrics.send('delete_txt_record_publish_error', 'counter', 1, + metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata}) def delete_acme_txt_records(domain): @@ -171,7 +196,16 @@ def delete_acme_txt_records(domain): all_txt_records = node.get_all_records_by_type('TXT') for txt_record in all_txt_records: current_app.logger.debug("Deleting TXT record name: {0}".format(fqdn)) - txt_record.delete() + try: + txt_record.delete() + except DynectDeleteError: + sentry.captureException( + extra={ + "fqdn": fqdn, "zone_name": zone_name, "node_name": node_name, + "txt_record": txt_record.txtdata} + ) + metrics.send('delete_txt_record_deleteerror', 'counter', 1, + metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata}) zone.publish() diff --git a/lemur/plugins/lemur_acme/plugin.py b/lemur/plugins/lemur_acme/plugin.py index c998c0b8..4360e5b5 100644 --- a/lemur/plugins/lemur_acme/plugin.py +++ b/lemur/plugins/lemur_acme/plugin.py @@ -142,9 +142,9 @@ class AcmeHandler(object): try: orderr = acme_client.finalize_order(order, deadline) except AcmeError: - sentry.captureException() + sentry.captureException(extra={"order_url": order.uri}) metrics.send('request_certificate_error', 'counter', 1) - current_app.logger.error(f"Unable to resolve Acme order: {order}", exc_info=True) + current_app.logger.error(f"Unable to resolve Acme order: {order.uri}", exc_info=True) raise pem_certificate = OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM, @@ -289,9 +289,10 @@ class AcmeHandler(object): dns_challenges = authz_record.dns_challenge host_to_validate = self.maybe_remove_wildcard(authz_record.host) host_to_validate = self.maybe_add_extension(host_to_validate, dns_provider_options) + dns_provider_plugin = self.get_dns_provider(dns_provider.provider_type) for dns_challenge in dns_challenges: try: - dns_provider.delete_txt_record( + dns_provider_plugin.delete_txt_record( authz_record.change_id, account_number, dns_challenge.validation_domain_name(host_to_validate), diff --git a/lemur/plugins/lemur_aws/elb.py b/lemur/plugins/lemur_aws/elb.py index b4391dd8..43d99ff2 100644 --- a/lemur/plugins/lemur_aws/elb.py +++ b/lemur/plugins/lemur_aws/elb.py @@ -10,7 +10,7 @@ from flask import current_app from retrying import retry -from lemur.extensions import metrics +from lemur.extensions import metrics, sentry from lemur.exceptions import InvalidListener from lemur.plugins.lemur_aws.sts import sts_client @@ -149,7 +149,7 @@ def describe_listeners_v2(**kwargs): @sts_client('elb') -@retry(retry_on_exception=retry_throttled, wait_fixed=2000) +@retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20) def describe_load_balancer_policies(load_balancer_name, policy_names, **kwargs): """ Fetching all policies currently associated with an ELB. @@ -157,11 +157,18 @@ def describe_load_balancer_policies(load_balancer_name, policy_names, **kwargs): :param load_balancer_name: :return: """ - return kwargs['client'].describe_load_balancer_policies(LoadBalancerName=load_balancer_name, PolicyNames=policy_names) + try: + return kwargs['client'].describe_load_balancer_policies(LoadBalancerName=load_balancer_name, + PolicyNames=policy_names) + except Exception as e: # noqa + metrics.send('describe_load_balancer_policies_fail', 'counter', 1, + metric_tags={"load_balancer_name": load_balancer_name, "policy_names": policy_names, "error": e}) + sentry.captureException(extra={"load_balancer_name": load_balancer_name, "policy_names": policy_names}) + raise @sts_client('elbv2') -@retry(retry_on_exception=retry_throttled, wait_fixed=2000) +@retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20) def describe_ssl_policies_v2(policy_names, **kwargs): """ Fetching all policies currently associated with an ELB. @@ -169,7 +176,13 @@ def describe_ssl_policies_v2(policy_names, **kwargs): :param policy_names: :return: """ - return kwargs['client'].describe_ssl_policies(Names=policy_names) + try: + return kwargs['client'].describe_ssl_policies(Names=policy_names) + except Exception as e: # noqa + metrics.send('describe_ssl_policies_v2_fail', 'counter', 1, + metric_tags={"policy_names": policy_names, "error": e}) + sentry.captureException(extra={"policy_names": policy_names}) + raise @sts_client('elb')