From 1e64851d791f9c0f03625f958496f5d2c87d17de Mon Sep 17 00:00:00 2001 From: Curtis Castrapel Date: Fri, 26 Apr 2019 10:16:18 -0700 Subject: [PATCH 1/2] Strip out self-polling logic and rely on ACME; Enhance ELB logging and retries --- lemur/plugins/lemur_acme/dyn.py | 17 ++-- lemur/plugins/lemur_acme/plugin.py | 44 ++++------- lemur/plugins/lemur_aws/elb.py | 121 +++++++++++++++++++---------- 3 files changed, 104 insertions(+), 78 deletions(-) diff --git a/lemur/plugins/lemur_acme/dyn.py b/lemur/plugins/lemur_acme/dyn.py index 4eb01958..4159532c 100644 --- a/lemur/plugins/lemur_acme/dyn.py +++ b/lemur/plugins/lemur_acme/dyn.py @@ -64,11 +64,10 @@ def wait_for_dns_change(change_id, account_number=None): metrics.send('wait_for_dns_change_fail', 'counter', 1) sentry.captureException( extra={ - "fqdn": fqdn, "txt_record": token} + "fqdn": str(fqdn), "txt_record": str(token)} ) metrics.send('wait_for_dns_change_error', 'counter', 1, metric_tags={'fqdn': fqdn, 'txt_record': token}) - raise Exception("Unable to query DNS token for fqdn {}.".format(fqdn)) return @@ -155,8 +154,8 @@ def delete_txt_record(change_id, account_number, domain, token): except DynectDeleteError: sentry.captureException( extra={ - "fqdn": fqdn, "zone_name": zone_name, "node_name": node_name, - "txt_record": txt_record.txtdata} + "fqdn": str(fqdn), "zone_name": str(zone_name), "node_name": str(node_name), + "txt_record": str(txt_record.txtdata)} ) metrics.send('delete_txt_record_deleteerror', 'counter', 1, metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata}) @@ -166,11 +165,11 @@ def delete_txt_record(change_id, account_number, domain, token): except DynectUpdateError: sentry.captureException( extra={ - "fqdn": fqdn, "zone_name": zone_name, "node_name": node_name, - "txt_record": txt_record.txtdata} + "fqdn": str(fqdn), "zone_name": str(zone_name), "node_name": str(node_name), + "txt_record": str(txt_record.txtdata)} ) metrics.send('delete_txt_record_publish_error', 'counter', 1, - metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata}) + metric_tags={'fqdn': str(fqdn), 'txt_record': str(txt_record.txtdata)}) def delete_acme_txt_records(domain): @@ -201,8 +200,8 @@ def delete_acme_txt_records(domain): except DynectDeleteError: sentry.captureException( extra={ - "fqdn": fqdn, "zone_name": zone_name, "node_name": node_name, - "txt_record": txt_record.txtdata} + "fqdn": str(fqdn), "zone_name": str(zone_name), "node_name": str(node_name), + "txt_record": str(txt_record.txtdata)} ) metrics.send('delete_txt_record_deleteerror', 'counter', 1, metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata}) diff --git a/lemur/plugins/lemur_acme/plugin.py b/lemur/plugins/lemur_acme/plugin.py index 7519c4c7..3350682c 100644 --- a/lemur/plugins/lemur_acme/plugin.py +++ b/lemur/plugins/lemur_acme/plugin.py @@ -102,49 +102,33 @@ class AcmeHandler(object): metrics.send('complete_dns_challenge_error_no_dnsproviders', 'counter', 1) raise Exception("No DNS providers found for domain: {}".format(authz_record.host)) - for dns_provider in dns_providers: - # Grab account number (For Route53) - dns_provider_options = json.loads(dns_provider.credentials) - account_number = dns_provider_options.get("account_id") - dns_provider_plugin = self.get_dns_provider(dns_provider.provider_type) - for change_id in authz_record.change_id: - try: - dns_provider_plugin.wait_for_dns_change(change_id, account_number=account_number) - except Exception: - metrics.send('complete_dns_challenge_error', 'counter', 1) - sentry.captureException() - current_app.logger.debug( - f"Unable to resolve DNS challenge for change_id: {change_id}, account_id: " - f"{account_number}", exc_info=True) - raise + for dns_challenge in authz_record.dns_challenge: + response = dns_challenge.response(acme_client.client.net.key) - for dns_challenge in authz_record.dns_challenge: - response = dns_challenge.response(acme_client.client.net.key) + verified = response.simple_verify( + dns_challenge.chall, + authz_record.host, + acme_client.client.net.key.public_key() + ) - verified = response.simple_verify( - dns_challenge.chall, - authz_record.host, - acme_client.client.net.key.public_key() - ) + if not verified: + metrics.send('complete_dns_challenge_verification_error', 'counter', 1) + raise ValueError("Failed verification") - if not verified: - metrics.send('complete_dns_challenge_verification_error', 'counter', 1) - raise ValueError("Failed verification") - - time.sleep(5) - acme_client.answer_challenge(dns_challenge, response) + time.sleep(5) + acme_client.answer_challenge(dns_challenge, response) def request_certificate(self, acme_client, authorizations, order): for authorization in authorizations: for authz in authorization.authz: authorization_resource, _ = acme_client.poll(authz) - deadline = datetime.datetime.now() + datetime.timedelta(seconds=90) + deadline = datetime.datetime.now() + datetime.timedelta(seconds=360) try: orderr = acme_client.poll_and_finalize(order, deadline) except AcmeError: - sentry.captureException(extra={"order_url": order.uri}) + sentry.captureException(extra={"order_url": str(order.uri)}) metrics.send('request_certificate_error', 'counter', 1) current_app.logger.error(f"Unable to resolve Acme order: {order.uri}", exc_info=True) raise diff --git a/lemur/plugins/lemur_aws/elb.py b/lemur/plugins/lemur_aws/elb.py index 43d99ff2..77e99d18 100644 --- a/lemur/plugins/lemur_aws/elb.py +++ b/lemur/plugins/lemur_aws/elb.py @@ -21,14 +21,22 @@ def retry_throttled(exception): :param exception: :return: """ + + # Log details about the exception + try: + raise exception + except Exception as e: + current_app.logger.error("ELB retry_throttled triggered", exc_info=True) + metrics.send('elb_retry', 'counter', 1, + metric_tags={"exception": e}) + sentry.captureException() + if isinstance(exception, botocore.exceptions.ClientError): if exception.response['Error']['Code'] == 'LoadBalancerNotFound': return False if exception.response['Error']['Code'] == 'CertificateNotFound': return False - - metrics.send('elb_retry', 'counter', 1) return True @@ -63,16 +71,20 @@ def get_all_elbs(**kwargs): :return: """ elbs = [] + try: + while True: + response = get_elbs(**kwargs) - while True: - response = get_elbs(**kwargs) + elbs += response['LoadBalancerDescriptions'] - elbs += response['LoadBalancerDescriptions'] - - if not response.get('NextMarker'): - return elbs - else: - kwargs.update(dict(Marker=response['NextMarker'])) + if not response.get('NextMarker'): + return elbs + else: + kwargs.update(dict(Marker=response['NextMarker'])) + except Exception as e: # noqa + metrics.send('get_all_elbs_error', 'counter', 1) + sentry.captureException() + raise def get_all_elbs_v2(**kwargs): @@ -84,18 +96,23 @@ def get_all_elbs_v2(**kwargs): """ elbs = [] - while True: - response = get_elbs_v2(**kwargs) - elbs += response['LoadBalancers'] + try: + while True: + response = get_elbs_v2(**kwargs) + elbs += response['LoadBalancers'] - if not response.get('NextMarker'): - return elbs - else: - kwargs.update(dict(Marker=response['NextMarker'])) + if not response.get('NextMarker'): + return elbs + else: + kwargs.update(dict(Marker=response['NextMarker'])) + except Exception as e: # noqa + metrics.send('get_all_elbs_v2_error', 'counter', 1) + sentry.captureException() + raise @sts_client('elbv2') -@retry(retry_on_exception=retry_throttled, wait_fixed=2000) +@retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20) def get_listener_arn_from_endpoint(endpoint_name, endpoint_port, **kwargs): """ Get a listener ARN from an endpoint. @@ -103,27 +120,40 @@ def get_listener_arn_from_endpoint(endpoint_name, endpoint_port, **kwargs): :param endpoint_port: :return: """ - client = kwargs.pop('client') - elbs = client.describe_load_balancers(Names=[endpoint_name]) - for elb in elbs['LoadBalancers']: - listeners = client.describe_listeners(LoadBalancerArn=elb['LoadBalancerArn']) - for listener in listeners['Listeners']: - if listener['Port'] == endpoint_port: - return listener['ListenerArn'] + try: + client = kwargs.pop('client') + elbs = client.describe_load_balancers(Names=[endpoint_name]) + for elb in elbs['LoadBalancers']: + listeners = client.describe_listeners(LoadBalancerArn=elb['LoadBalancerArn']) + for listener in listeners['Listeners']: + if listener['Port'] == endpoint_port: + return listener['ListenerArn'] + except Exception as e: # noqa + metrics.send('get_listener_arn_from_endpoint_error', 'counter', 1, + metric_tags={"error": e, "endpoint_name": endpoint_name, "endpoint_port": endpoint_port}) + sentry.captureException(extra={"endpoint_name": str(endpoint_name), + "endpoint_port": str(endpoint_port)}) + raise @sts_client('elb') -@retry(retry_on_exception=retry_throttled, wait_fixed=2000) +@retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20) def get_elbs(**kwargs): """ Fetches one page elb objects for a given account and region. """ - client = kwargs.pop('client') - return client.describe_load_balancers(**kwargs) + try: + client = kwargs.pop('client') + return client.describe_load_balancers(**kwargs) + except Exception as e: # noqa + metrics.send('get_elbs_error', 'counter', 1, + metric_tags={"error": e}) + sentry.captureException() + raise @sts_client('elbv2') -@retry(retry_on_exception=retry_throttled, wait_fixed=2000) +@retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20) def get_elbs_v2(**kwargs): """ Fetches one page of elb objects for a given account and region. @@ -131,12 +161,18 @@ def get_elbs_v2(**kwargs): :param kwargs: :return: """ - client = kwargs.pop('client') - return client.describe_load_balancers(**kwargs) + try: + client = kwargs.pop('client') + return client.describe_load_balancers(**kwargs) + except Exception as e: # noqa + metrics.send('get_elbs_v2_error', 'counter', 1, + metric_tags={"error": e}) + sentry.captureException() + raise @sts_client('elbv2') -@retry(retry_on_exception=retry_throttled, wait_fixed=2000) +@retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20) def describe_listeners_v2(**kwargs): """ Fetches one page of listener objects for a given elb arn. @@ -144,8 +180,14 @@ def describe_listeners_v2(**kwargs): :param kwargs: :return: """ - client = kwargs.pop('client') - return client.describe_listeners(**kwargs) + try: + client = kwargs.pop('client') + return client.describe_listeners(**kwargs) + except Exception as e: # noqa + metrics.send('describe_listeners_v2_error', 'counter', 1, + metric_tags={"error": e}) + sentry.captureException() + raise @sts_client('elb') @@ -157,11 +199,12 @@ def describe_load_balancer_policies(load_balancer_name, policy_names, **kwargs): :param load_balancer_name: :return: """ + try: return kwargs['client'].describe_load_balancer_policies(LoadBalancerName=load_balancer_name, PolicyNames=policy_names) except Exception as e: # noqa - metrics.send('describe_load_balancer_policies_fail', 'counter', 1, + metrics.send('describe_load_balancer_policies_error', 'counter', 1, metric_tags={"load_balancer_name": load_balancer_name, "policy_names": policy_names, "error": e}) sentry.captureException(extra={"load_balancer_name": load_balancer_name, "policy_names": policy_names}) raise @@ -179,14 +222,14 @@ def describe_ssl_policies_v2(policy_names, **kwargs): try: return kwargs['client'].describe_ssl_policies(Names=policy_names) except Exception as e: # noqa - metrics.send('describe_ssl_policies_v2_fail', 'counter', 1, + metrics.send('describe_ssl_policies_v2_error', 'counter', 1, metric_tags={"policy_names": policy_names, "error": e}) sentry.captureException(extra={"policy_names": policy_names}) raise @sts_client('elb') -@retry(retry_on_exception=retry_throttled, wait_fixed=2000) +@retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20) def describe_load_balancer_types(policies, **kwargs): """ Describe the policies with policy details. @@ -198,7 +241,7 @@ def describe_load_balancer_types(policies, **kwargs): @sts_client('elb') -@retry(retry_on_exception=retry_throttled, wait_fixed=2000) +@retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20) def attach_certificate(name, port, certificate_id, **kwargs): """ Attaches a certificate to a listener, throws exception @@ -218,7 +261,7 @@ def attach_certificate(name, port, certificate_id, **kwargs): @sts_client('elbv2') -@retry(retry_on_exception=retry_throttled, wait_fixed=2000) +@retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20) def attach_certificate_v2(listener_arn, port, certificates, **kwargs): """ Attaches a certificate to a listener, throws exception From 1a3ba46873c8437d11829a76f2451feaa0235397 Mon Sep 17 00:00:00 2001 From: Curtis Castrapel Date: Fri, 26 Apr 2019 10:18:54 -0700 Subject: [PATCH 2/2] More retry changes --- lemur/plugins/lemur_aws/elb.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lemur/plugins/lemur_aws/elb.py b/lemur/plugins/lemur_aws/elb.py index 77e99d18..618f75e8 100644 --- a/lemur/plugins/lemur_aws/elb.py +++ b/lemur/plugins/lemur_aws/elb.py @@ -206,7 +206,8 @@ def describe_load_balancer_policies(load_balancer_name, policy_names, **kwargs): except Exception as e: # noqa metrics.send('describe_load_balancer_policies_error', 'counter', 1, metric_tags={"load_balancer_name": load_balancer_name, "policy_names": policy_names, "error": e}) - sentry.captureException(extra={"load_balancer_name": load_balancer_name, "policy_names": policy_names}) + sentry.captureException(extra={"load_balancer_name": str(load_balancer_name), + "policy_names": str(policy_names)}) raise @@ -224,7 +225,7 @@ def describe_ssl_policies_v2(policy_names, **kwargs): except Exception as e: # noqa metrics.send('describe_ssl_policies_v2_error', 'counter', 1, metric_tags={"policy_names": policy_names, "error": e}) - sentry.captureException(extra={"policy_names": policy_names}) + sentry.captureException(extra={"policy_names": str(policy_names)}) raise