Merge pull request #2763 from castrapel/metrics_errors_acme

Better exception handling, logging, and metrics for ACME flow
This commit is contained in:
Curtis 2019-04-24 15:33:11 -07:00 committed by GitHub
commit e313b74813
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 63 additions and 13 deletions

View File

@ -10,13 +10,21 @@ from dyn.tm.session import DynectSession
from dyn.tm.zones import Node, Zone, get_all_zones
from flask import current_app
from lemur.extensions import metrics, sentry
def get_dynect_session():
    """Open a Dyn session using the credentials in the Flask app config.

    The customer name, username and password are read from the
    ``ACME_DYN_CUSTOMER_NAME``, ``ACME_DYN_USERNAME`` and
    ``ACME_DYN_PASSWORD`` config keys; each falls back to an empty string
    when unset.

    :return: the ``DynectSession`` that was created.
    :raises Exception: whatever ``DynectSession`` raised. The error is
        reported to Sentry, counted as a ``get_dynect_session_fail`` metric
        and logged before being re-raised unchanged.
    """
    config = current_app.config
    # Build the session exactly once, inside the guarded section, so every
    # failure goes through the reporting path below.
    try:
        dynect_session = DynectSession(
            config.get('ACME_DYN_CUSTOMER_NAME', ''),
            config.get('ACME_DYN_USERNAME', ''),
            config.get('ACME_DYN_PASSWORD', ''),
        )
    except Exception:
        sentry.captureException()
        metrics.send('get_dynect_session_fail', 'counter', 1)
        current_app.logger.debug("Unable to establish connection to Dyn", exc_info=True)
        # Re-raise with the original traceback; callers decide how to recover.
        raise
    return dynect_session
@ -30,10 +38,12 @@ def _has_dns_propagated(name, token):
for txt_record in rdata.strings:
txt_records.append(txt_record.decode("utf-8"))
except dns.exception.DNSException:
metrics.send('has_dns_propagated_fail', 'counter', 1)
return False
for txt_record in txt_records:
if txt_record == token:
metrics.send('has_dns_propagated_success', 'counter', 1)
return True
return False
@ -46,10 +56,12 @@ def wait_for_dns_change(change_id, account_number=None):
status = _has_dns_propagated(fqdn, token)
current_app.logger.debug("Record status for fqdn: {}: {}".format(fqdn, status))
if status:
metrics.send('wait_for_dns_change_success', 'counter', 1)
break
time.sleep(20)
if not status:
# TODO: Delete associated DNS text record here
metrics.send('wait_for_dns_change_fail', 'counter', 1)
raise Exception("Unable to query DNS token for fqdn {}.".format(fqdn))
return
@ -67,6 +79,7 @@ def get_zone_name(domain):
if z.name.count(".") > zone_name.count("."):
zone_name = z.name
if not zone_name:
metrics.send('dyn_no_zone_name', 'counter', 1)
raise Exception("No Dyn zone found for domain: {}".format(domain))
return zone_name
@ -99,6 +112,8 @@ def create_txt_record(domain, token, account_number):
"Record already exists: {}".format(domain, token, e), exc_info=True
)
else:
metrics.send('create_txt_record_error', 'counter', 1)
sentry.captureException()
raise
change_id = (fqdn, token)
@ -122,6 +137,8 @@ def delete_txt_record(change_id, account_number, domain, token):
try:
all_txt_records = node.get_all_records_by_type('TXT')
except DynectGetError:
sentry.captureException()
metrics.send('delete_txt_record_error', 'counter', 1)
# No Text Records remain or host is not in the zone anymore because all records have been deleted.
return
for txt_record in all_txt_records:
@ -178,6 +195,7 @@ def get_authoritative_nameserver(domain):
rcode = response.rcode()
if rcode != dns.rcode.NOERROR:
metrics.send('get_authoritative_nameserver_error', 'counter', 1)
if rcode == dns.rcode.NXDOMAIN:
raise Exception('%s does not exist.' % sub)
else:

View File

@ -28,6 +28,7 @@ from lemur.authorizations import service as authorization_service
from lemur.common.utils import generate_private_key
from lemur.dns_providers import service as dns_provider_service
from lemur.exceptions import InvalidAuthority, InvalidConfiguration, UnknownProvider
from lemur.extensions import metrics, sentry
from lemur.plugins import lemur_acme as acme
from lemur.plugins.bases import IssuerPlugin
from lemur.plugins.lemur_acme import cloudflare, dyn, route53
@ -47,7 +48,9 @@ class AcmeHandler(object):
try:
self.all_dns_providers = dns_provider_service.get_all_dns_providers()
except Exception as e:
current_app.logger.error("Unable to fetch DNS Providers: {}".format(e))
metrics.send('AcmeHandler_init_error', 'counter', 1)
sentry.captureException()
current_app.logger.error(f"Unable to fetch DNS Providers: {e}")
self.all_dns_providers = []
def find_dns_challenge(self, authorizations):
@ -94,6 +97,7 @@ class AcmeHandler(object):
current_app.logger.debug("Finalizing DNS challenge for {0}".format(authz_record.authz[0].body.identifier.value))
dns_providers = self.dns_providers_for_domain.get(authz_record.host)
if not dns_providers:
metrics.send('complete_dns_challenge_error_no_dnsproviders', 'counter', 1)
raise Exception("No DNS providers found for domain: {}".format(authz_record.host))
for dns_provider in dns_providers:
@ -102,7 +106,15 @@ class AcmeHandler(object):
account_number = dns_provider_options.get("account_id")
dns_provider_plugin = self.get_dns_provider(dns_provider.provider_type)
for change_id in authz_record.change_id:
dns_provider_plugin.wait_for_dns_change(change_id, account_number=account_number)
try:
dns_provider_plugin.wait_for_dns_change(change_id, account_number=account_number)
except Exception:
metrics.send('complete_dns_challenge_error', 'counter', 1)
sentry.captureException()
current_app.logger.debug(
f"Unable to resolve DNS challenge for change_id: {change_id}, account_id: "
f"{account_number}", exc_info=True)
raise
for dns_challenge in authz_record.dns_challenge:
response = dns_challenge.response(acme_client.client.net.key)
@ -114,6 +126,7 @@ class AcmeHandler(object):
)
if not verified:
metrics.send('complete_dns_challenge_verification_error', 'counter', 1)
raise ValueError("Failed verification")
time.sleep(5)
@ -129,7 +142,9 @@ class AcmeHandler(object):
try:
orderr = acme_client.finalize_order(order, deadline)
except AcmeError:
current_app.logger.error("Unable to resolve Acme order: {}".format(order), exc_info=True)
sentry.captureException()
metrics.send('request_certificate_error', 'counter', 1)
current_app.logger.error(f"Unable to resolve Acme order: {order}", exc_info=True)
raise
pem_certificate = OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
@ -196,6 +211,7 @@ class AcmeHandler(object):
for domain in order_info.domains:
if not self.dns_providers_for_domain.get(domain):
metrics.send('get_authorizations_no_dns_provider_for_domain', 'counter', 1)
raise Exception("No DNS providers found for domain: {}".format(domain))
for dns_provider in self.dns_providers_for_domain[domain]:
dns_provider_plugin = self.get_dns_provider(dns_provider.provider_type)
@ -284,6 +300,8 @@ class AcmeHandler(object):
except Exception as e:
# If this fails, it's most likely because the record doesn't exist (It was already cleaned up)
# or we're not authorized to modify it.
metrics.send('cleanup_dns_challenges_error', 'counter', 1)
sentry.captureException()
pass
def get_dns_provider(self, type):
@ -378,12 +396,15 @@ class ACMEIssuerPlugin(IssuerPlugin):
try:
order = acme_client.new_order(pending_cert.csr)
except WildcardUnsupportedError:
metrics.send('get_ordered_certificate_wildcard_unsupported', 'counter', 1)
raise Exception("The currently selected ACME CA endpoint does"
" not support issuing wildcard certificates.")
try:
authorizations = self.acme.get_authorizations(acme_client, order, order_info)
except ClientError:
current_app.logger.error("Unable to resolve pending cert: {}".format(pending_cert.name), exc_info=True)
sentry.captureException()
metrics.send('get_ordered_certificate_error', 'counter', 1)
current_app.logger.error(f"Unable to resolve pending cert: {pending_cert.name}", exc_info=True)
return False
authorizations = self.acme.finalize_authorizations(acme_client, authorizations)
@ -418,6 +439,8 @@ class ACMEIssuerPlugin(IssuerPlugin):
try:
order = acme_client.new_order(pending_cert.csr)
except WildcardUnsupportedError:
sentry.captureException()
metrics.send('get_ordered_certificates_wildcard_unsupported_error', 'counter', 1)
raise Exception("The currently selected ACME CA endpoint does"
" not support issuing wildcard certificates.")
@ -430,7 +453,13 @@ class ACMEIssuerPlugin(IssuerPlugin):
"order": order,
})
except (ClientError, ValueError, Exception) as e:
current_app.logger.error("Unable to resolve pending cert: {}".format(pending_cert), exc_info=True)
sentry.captureException()
metrics.send('get_ordered_certificates_pending_creation_error', 'counter', 1)
current_app.logger.error(f"Unable to resolve pending cert: {pending_cert}", exc_info=True)
error = e
if globals().get("order") and order:
error += f" Order uri: {order.uri}"
certs.append({
"cert": False,
"pending_cert": pending_cert,
@ -459,14 +488,17 @@ class ACMEIssuerPlugin(IssuerPlugin):
"pending_cert": entry["pending_cert"],
})
except (PollError, AcmeError, Exception) as e:
sentry.captureException()
metrics.send('get_ordered_certificates_resolution_error', 'counter', 1)
order_url = order.uri
error = f"{e}. Order URI: {order_url}"
current_app.logger.error(
"Unable to resolve pending cert: {}. "
"Check out {} for more information.".format(pending_cert, order_url), exc_info=True)
f"Unable to resolve pending cert: {pending_cert}. "
f"Check out {order_url} for more information.", exc_info=True)
certs.append({
"cert": False,
"pending_cert": entry["pending_cert"],
"last_error": e,
"last_error": error,
})
# Ensure DNS records get deleted
self.acme.cleanup_dns_challenges(