From 272285f64a030d2dd396ac8689d147a95561b8d0 Mon Sep 17 00:00:00 2001 From: Curtis Castrapel Date: Wed, 24 Apr 2019 15:26:23 -0700 Subject: [PATCH] Better exception handling, logging, and metrics for ACME flow --- lemur/plugins/lemur_acme/dyn.py | 28 +++++++++++++---- lemur/plugins/lemur_acme/plugin.py | 48 +++++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/lemur/plugins/lemur_acme/dyn.py b/lemur/plugins/lemur_acme/dyn.py index 5d419f7f..232c6ca3 100644 --- a/lemur/plugins/lemur_acme/dyn.py +++ b/lemur/plugins/lemur_acme/dyn.py @@ -10,13 +10,21 @@ from dyn.tm.session import DynectSession from dyn.tm.zones import Node, Zone, get_all_zones from flask import current_app +from lemur.extensions import metrics, sentry + def get_dynect_session(): - dynect_session = DynectSession( - current_app.config.get('ACME_DYN_CUSTOMER_NAME', ''), - current_app.config.get('ACME_DYN_USERNAME', ''), - current_app.config.get('ACME_DYN_PASSWORD', ''), - ) + try: + dynect_session = DynectSession( + current_app.config.get('ACME_DYN_CUSTOMER_NAME', ''), + current_app.config.get('ACME_DYN_USERNAME', ''), + current_app.config.get('ACME_DYN_PASSWORD', ''), + ) + except Exception as e: + sentry.captureException() + metrics.send('get_dynect_session_fail', 'counter', 1) + current_app.logger.debug("Unable to establish connection to Dyn", exc_info=True) + raise return dynect_session @@ -30,10 +38,12 @@ def _has_dns_propagated(name, token): for txt_record in rdata.strings: txt_records.append(txt_record.decode("utf-8")) except dns.exception.DNSException: + metrics.send('has_dns_propagated_fail', 'counter', 1) return False for txt_record in txt_records: if txt_record == token: + metrics.send('has_dns_propagated_success', 'counter', 1) return True return False @@ -46,10 +56,12 @@ def wait_for_dns_change(change_id, account_number=None): status = _has_dns_propagated(fqdn, token) current_app.logger.debug("Record status for fqdn: {}: {}".format(fqdn, status)) if status: + metrics.send('wait_for_dns_change_success', 'counter', 1) break time.sleep(20) if not status: # TODO: Delete associated DNS text record here + metrics.send('wait_for_dns_change_fail', 'counter', 1) raise Exception("Unable to query DNS token for fqdn {}.".format(fqdn)) return @@ -67,6 +79,7 @@ def get_zone_name(domain): if z.name.count(".") > zone_name.count("."): zone_name = z.name if not zone_name: + metrics.send('dyn_no_zone_name', 'counter', 1) raise Exception("No Dyn zone found for domain: {}".format(domain)) return zone_name @@ -99,6 +112,8 @@ def create_txt_record(domain, token, account_number): "Record already exists: {}".format(domain, token, e), exc_info=True ) else: + metrics.send('create_txt_record_error', 'counter', 1) + sentry.captureException() raise change_id = (fqdn, token) @@ -122,6 +137,8 @@ def delete_txt_record(change_id, account_number, domain, token): try: all_txt_records = node.get_all_records_by_type('TXT') except DynectGetError: + sentry.captureException() + metrics.send('delete_txt_record_error', 'counter', 1) # No Text Records remain or host is not in the zone anymore because all records have been deleted. return for txt_record in all_txt_records: @@ -178,6 +195,7 @@ def get_authoritative_nameserver(domain): rcode = response.rcode() if rcode != dns.rcode.NOERROR: + metrics.send('get_authoritative_nameserver_error', 'counter', 1) if rcode == dns.rcode.NXDOMAIN: raise Exception('%s does not exist.' % sub) else: diff --git a/lemur/plugins/lemur_acme/plugin.py b/lemur/plugins/lemur_acme/plugin.py index 59cde380..c998c0b8 100644 --- a/lemur/plugins/lemur_acme/plugin.py +++ b/lemur/plugins/lemur_acme/plugin.py @@ -28,6 +28,7 @@ from lemur.authorizations import service as authorization_service from lemur.common.utils import generate_private_key from lemur.dns_providers import service as dns_provider_service from lemur.exceptions import InvalidAuthority, InvalidConfiguration, UnknownProvider +from lemur.extensions import metrics, sentry from lemur.plugins import lemur_acme as acme from lemur.plugins.bases import IssuerPlugin from lemur.plugins.lemur_acme import cloudflare, dyn, route53 @@ -47,7 +48,9 @@ class AcmeHandler(object): try: self.all_dns_providers = dns_provider_service.get_all_dns_providers() except Exception as e: - current_app.logger.error("Unable to fetch DNS Providers: {}".format(e)) + metrics.send('AcmeHandler_init_error', 'counter', 1) + sentry.captureException() + current_app.logger.error(f"Unable to fetch DNS Providers: {e}") self.all_dns_providers = [] def find_dns_challenge(self, authorizations): @@ -94,6 +97,7 @@ class AcmeHandler(object): current_app.logger.debug("Finalizing DNS challenge for {0}".format(authz_record.authz[0].body.identifier.value)) dns_providers = self.dns_providers_for_domain.get(authz_record.host) if not dns_providers: + metrics.send('complete_dns_challenge_error_no_dnsproviders', 'counter', 1) raise Exception("No DNS providers found for domain: {}".format(authz_record.host)) for dns_provider in dns_providers: @@ -102,7 +106,15 @@ class AcmeHandler(object): account_number = dns_provider_options.get("account_id") dns_provider_plugin = self.get_dns_provider(dns_provider.provider_type) for change_id in authz_record.change_id: - dns_provider_plugin.wait_for_dns_change(change_id, account_number=account_number) + try: + dns_provider_plugin.wait_for_dns_change(change_id, account_number=account_number) + except Exception: + metrics.send('complete_dns_challenge_error', 'counter', 1) + sentry.captureException() + current_app.logger.debug( + f"Unable to resolve DNS challenge for change_id: {change_id}, account_id: " + f"{account_number}", exc_info=True) + raise for dns_challenge in authz_record.dns_challenge: response = dns_challenge.response(acme_client.client.net.key) @@ -114,6 +126,7 @@ class AcmeHandler(object): ) if not verified: + metrics.send('complete_dns_challenge_verification_error', 'counter', 1) raise ValueError("Failed verification") time.sleep(5) @@ -129,7 +142,9 @@ class AcmeHandler(object): try: orderr = acme_client.finalize_order(order, deadline) except AcmeError: - current_app.logger.error("Unable to resolve Acme order: {}".format(order), exc_info=True) + sentry.captureException() + metrics.send('request_certificate_error', 'counter', 1) + current_app.logger.error(f"Unable to resolve Acme order: {order}", exc_info=True) raise pem_certificate = OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM, @@ -196,6 +211,7 @@ class AcmeHandler(object): for domain in order_info.domains: if not self.dns_providers_for_domain.get(domain): + metrics.send('get_authorizations_no_dns_provider_for_domain', 'counter', 1) raise Exception("No DNS providers found for domain: {}".format(domain)) for dns_provider in self.dns_providers_for_domain[domain]: dns_provider_plugin = self.get_dns_provider(dns_provider.provider_type) @@ -284,6 +300,8 @@ class AcmeHandler(object): except Exception as e: # If this fails, it's most likely because the record doesn't exist (It was already cleaned up) # or we're not authorized to modify it. + metrics.send('cleanup_dns_challenges_error', 'counter', 1) + sentry.captureException() pass def get_dns_provider(self, type): @@ -378,12 +396,15 @@ class ACMEIssuerPlugin(IssuerPlugin): try: order = acme_client.new_order(pending_cert.csr) except WildcardUnsupportedError: + metrics.send('get_ordered_certificate_wildcard_unsupported', 'counter', 1) raise Exception("The currently selected ACME CA endpoint does" " not support issuing wildcard certificates.") try: authorizations = self.acme.get_authorizations(acme_client, order, order_info) except ClientError: - current_app.logger.error("Unable to resolve pending cert: {}".format(pending_cert.name), exc_info=True) + sentry.captureException() + metrics.send('get_ordered_certificate_error', 'counter', 1) + current_app.logger.error(f"Unable to resolve pending cert: {pending_cert.name}", exc_info=True) return False authorizations = self.acme.finalize_authorizations(acme_client, authorizations) @@ -418,6 +439,8 @@ class ACMEIssuerPlugin(IssuerPlugin): try: order = acme_client.new_order(pending_cert.csr) except WildcardUnsupportedError: + sentry.captureException() + metrics.send('get_ordered_certificates_wildcard_unsupported_error', 'counter', 1) raise Exception("The currently selected ACME CA endpoint does" " not support issuing wildcard certificates.") @@ -430,7 +453,13 @@ class ACMEIssuerPlugin(IssuerPlugin): "order": order, }) except (ClientError, ValueError, Exception) as e: - current_app.logger.error("Unable to resolve pending cert: {}".format(pending_cert), exc_info=True) + sentry.captureException() + metrics.send('get_ordered_certificates_pending_creation_error', 'counter', 1) + current_app.logger.error(f"Unable to resolve pending cert: {pending_cert}", exc_info=True) + + error = e + if globals().get("order") and order: + error += f" Order uri: {order.uri}" certs.append({ "cert": False, "pending_cert": pending_cert, @@ -459,14 +488,17 @@ class ACMEIssuerPlugin(IssuerPlugin): "pending_cert": entry["pending_cert"], }) except (PollError, AcmeError, Exception) as e: + sentry.captureException() + metrics.send('get_ordered_certificates_resolution_error', 'counter', 1) order_url = order.uri + error = f"{e}. Order URI: {order_url}" current_app.logger.error( - "Unable to resolve pending cert: {}. " - "Check out {} for more information.".format(pending_cert, order_url), exc_info=True) + f"Unable to resolve pending cert: {pending_cert}. " + f"Check out {order_url} for more information.", exc_info=True) certs.append({ "cert": False, "pending_cert": entry["pending_cert"], - "last_error": e, + "last_error": error, }) # Ensure DNS records get deleted self.acme.cleanup_dns_challenges(