Better metrics and error reporting

This commit is contained in:
Curtis Castrapel 2019-04-25 13:50:41 -07:00
parent 272285f64a
commit 2bc604e5a9
3 changed files with 64 additions and 16 deletions

View File

@ -5,7 +5,7 @@ import dns.exception
import dns.name import dns.name
import dns.query import dns.query
import dns.resolver import dns.resolver
from dyn.tm.errors import DynectCreateError, DynectGetError from dyn.tm.errors import DynectCreateError, DynectDeleteError, DynectGetError, DynectUpdateError
from dyn.tm.session import DynectSession from dyn.tm.session import DynectSession
from dyn.tm.zones import Node, Zone, get_all_zones from dyn.tm.zones import Node, Zone, get_all_zones
from flask import current_app from flask import current_app
@ -51,17 +51,23 @@ def _has_dns_propagated(name, token):
def wait_for_dns_change(change_id, account_number=None): def wait_for_dns_change(change_id, account_number=None):
fqdn, token = change_id fqdn, token = change_id
number_of_attempts = 10 number_of_attempts = 20
for attempts in range(0, number_of_attempts): for attempts in range(0, number_of_attempts):
status = _has_dns_propagated(fqdn, token) status = _has_dns_propagated(fqdn, token)
current_app.logger.debug("Record status for fqdn: {}: {}".format(fqdn, status)) current_app.logger.debug("Record status for fqdn: {}: {}".format(fqdn, status))
if status: if status:
metrics.send('wait_for_dns_change_success', 'counter', 1) metrics.send('wait_for_dns_change_success', 'counter', 1)
break break
time.sleep(20) time.sleep(10)
if not status: if not status:
# TODO: Delete associated DNS text record here # TODO: Delete associated DNS text record here
metrics.send('wait_for_dns_change_fail', 'counter', 1) metrics.send('wait_for_dns_change_fail', 'counter', 1)
sentry.captureException(
extra={
"fqdn": fqdn, "txt_record": token}
)
metrics.send('wait_for_dns_change_error', 'counter', 1,
metric_tags={'fqdn': fqdn, 'txt_record': token})
raise Exception("Unable to query DNS token for fqdn {}.".format(fqdn)) raise Exception("Unable to query DNS token for fqdn {}.".format(fqdn))
return return
@ -105,7 +111,7 @@ def create_txt_record(domain, token, account_number):
zone.add_record(node_name, record_type='TXT', txtdata="\"{}\"".format(token), ttl=5) zone.add_record(node_name, record_type='TXT', txtdata="\"{}\"".format(token), ttl=5)
zone.publish() zone.publish()
current_app.logger.debug("TXT record created: {0}, token: {1}".format(fqdn, token)) current_app.logger.debug("TXT record created: {0}, token: {1}".format(fqdn, token))
except DynectCreateError as e: except (DynectCreateError, DynectUpdateError) as e:
if "Cannot duplicate existing record data" in e.message: if "Cannot duplicate existing record data" in e.message:
current_app.logger.debug( current_app.logger.debug(
"Unable to add record. Domain: {}. Token: {}. " "Unable to add record. Domain: {}. Token: {}. "
@ -138,14 +144,33 @@ def delete_txt_record(change_id, account_number, domain, token):
all_txt_records = node.get_all_records_by_type('TXT') all_txt_records = node.get_all_records_by_type('TXT')
except DynectGetError: except DynectGetError:
sentry.captureException() sentry.captureException()
metrics.send('delete_txt_record_error', 'counter', 1) metrics.send('delete_txt_record_geterror', 'counter', 1)
# No Text Records remain or host is not in the zone anymore because all records have been deleted. # No Text Records remain or host is not in the zone anymore because all records have been deleted.
return return
for txt_record in all_txt_records: for txt_record in all_txt_records:
if txt_record.txtdata == ("{}".format(token)): if txt_record.txtdata == ("{}".format(token)):
current_app.logger.debug("Deleting TXT record name: {0}".format(fqdn)) current_app.logger.debug("Deleting TXT record name: {0}".format(fqdn))
txt_record.delete() try:
zone.publish() txt_record.delete()
except DynectDeleteError:
sentry.captureException(
extra={
"fqdn": fqdn, "zone_name": zone_name, "node_name": node_name,
"txt_record": txt_record.txtdata}
)
metrics.send('delete_txt_record_deleteerror', 'counter', 1,
metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata})
try:
zone.publish()
except DynectUpdateError:
sentry.captureException(
extra={
"fqdn": fqdn, "zone_name": zone_name, "node_name": node_name,
"txt_record": txt_record.txtdata}
)
metrics.send('delete_txt_record_publish_error', 'counter', 1,
metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata})
def delete_acme_txt_records(domain): def delete_acme_txt_records(domain):
@ -171,7 +196,16 @@ def delete_acme_txt_records(domain):
all_txt_records = node.get_all_records_by_type('TXT') all_txt_records = node.get_all_records_by_type('TXT')
for txt_record in all_txt_records: for txt_record in all_txt_records:
current_app.logger.debug("Deleting TXT record name: {0}".format(fqdn)) current_app.logger.debug("Deleting TXT record name: {0}".format(fqdn))
txt_record.delete() try:
txt_record.delete()
except DynectDeleteError:
sentry.captureException(
extra={
"fqdn": fqdn, "zone_name": zone_name, "node_name": node_name,
"txt_record": txt_record.txtdata}
)
metrics.send('delete_txt_record_deleteerror', 'counter', 1,
metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata})
zone.publish() zone.publish()

View File

@ -142,9 +142,9 @@ class AcmeHandler(object):
try: try:
orderr = acme_client.finalize_order(order, deadline) orderr = acme_client.finalize_order(order, deadline)
except AcmeError: except AcmeError:
sentry.captureException() sentry.captureException(extra={"order_url": order.uri})
metrics.send('request_certificate_error', 'counter', 1) metrics.send('request_certificate_error', 'counter', 1)
current_app.logger.error(f"Unable to resolve Acme order: {order}", exc_info=True) current_app.logger.error(f"Unable to resolve Acme order: {order.uri}", exc_info=True)
raise raise
pem_certificate = OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM, pem_certificate = OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
@ -289,9 +289,10 @@ class AcmeHandler(object):
dns_challenges = authz_record.dns_challenge dns_challenges = authz_record.dns_challenge
host_to_validate = self.maybe_remove_wildcard(authz_record.host) host_to_validate = self.maybe_remove_wildcard(authz_record.host)
host_to_validate = self.maybe_add_extension(host_to_validate, dns_provider_options) host_to_validate = self.maybe_add_extension(host_to_validate, dns_provider_options)
dns_provider_plugin = self.get_dns_provider(dns_provider.provider_type)
for dns_challenge in dns_challenges: for dns_challenge in dns_challenges:
try: try:
dns_provider.delete_txt_record( dns_provider_plugin.delete_txt_record(
authz_record.change_id, authz_record.change_id,
account_number, account_number,
dns_challenge.validation_domain_name(host_to_validate), dns_challenge.validation_domain_name(host_to_validate),

View File

@ -10,7 +10,7 @@ from flask import current_app
from retrying import retry from retrying import retry
from lemur.extensions import metrics from lemur.extensions import metrics, sentry
from lemur.exceptions import InvalidListener from lemur.exceptions import InvalidListener
from lemur.plugins.lemur_aws.sts import sts_client from lemur.plugins.lemur_aws.sts import sts_client
@ -149,7 +149,7 @@ def describe_listeners_v2(**kwargs):
@sts_client('elb') @sts_client('elb')
@retry(retry_on_exception=retry_throttled, wait_fixed=2000) @retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20)
def describe_load_balancer_policies(load_balancer_name, policy_names, **kwargs): def describe_load_balancer_policies(load_balancer_name, policy_names, **kwargs):
""" """
Fetching all policies currently associated with an ELB. Fetching all policies currently associated with an ELB.
@ -157,11 +157,18 @@ def describe_load_balancer_policies(load_balancer_name, policy_names, **kwargs):
:param load_balancer_name: :param load_balancer_name:
:return: :return:
""" """
return kwargs['client'].describe_load_balancer_policies(LoadBalancerName=load_balancer_name, PolicyNames=policy_names) try:
return kwargs['client'].describe_load_balancer_policies(LoadBalancerName=load_balancer_name,
PolicyNames=policy_names)
except Exception as e: # noqa
metrics.send('describe_load_balancer_policies_fail', 'counter', 1,
metric_tags={"load_balancer_name": load_balancer_name, "policy_names": policy_names, "error": e})
sentry.captureException(extra={"load_balancer_name": load_balancer_name, "policy_names": policy_names})
raise
@sts_client('elbv2') @sts_client('elbv2')
@retry(retry_on_exception=retry_throttled, wait_fixed=2000) @retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20)
def describe_ssl_policies_v2(policy_names, **kwargs): def describe_ssl_policies_v2(policy_names, **kwargs):
""" """
Fetching all policies currently associated with an ELB. Fetching all policies currently associated with an ELB.
@ -169,7 +176,13 @@ def describe_ssl_policies_v2(policy_names, **kwargs):
:param policy_names: :param policy_names:
:return: :return:
""" """
return kwargs['client'].describe_ssl_policies(Names=policy_names) try:
return kwargs['client'].describe_ssl_policies(Names=policy_names)
except Exception as e: # noqa
metrics.send('describe_ssl_policies_v2_fail', 'counter', 1,
metric_tags={"policy_names": policy_names, "error": e})
sentry.captureException(extra={"policy_names": policy_names})
raise
@sts_client('elb') @sts_client('elb')