Better metrics and error reporting

This commit is contained in:
Curtis Castrapel 2019-04-25 13:50:41 -07:00
parent 272285f64a
commit 2bc604e5a9
3 changed files with 64 additions and 16 deletions

View File

@ -5,7 +5,7 @@ import dns.exception
import dns.name
import dns.query
import dns.resolver
from dyn.tm.errors import DynectCreateError, DynectGetError
from dyn.tm.errors import DynectCreateError, DynectDeleteError, DynectGetError, DynectUpdateError
from dyn.tm.session import DynectSession
from dyn.tm.zones import Node, Zone, get_all_zones
from flask import current_app
@ -51,17 +51,23 @@ def _has_dns_propagated(name, token):
def wait_for_dns_change(change_id, account_number=None):
fqdn, token = change_id
number_of_attempts = 10
number_of_attempts = 20
for attempts in range(0, number_of_attempts):
status = _has_dns_propagated(fqdn, token)
current_app.logger.debug("Record status for fqdn: {}: {}".format(fqdn, status))
if status:
metrics.send('wait_for_dns_change_success', 'counter', 1)
break
time.sleep(20)
time.sleep(10)
if not status:
# TODO: Delete associated DNS text record here
metrics.send('wait_for_dns_change_fail', 'counter', 1)
sentry.captureException(
extra={
"fqdn": fqdn, "txt_record": token}
)
metrics.send('wait_for_dns_change_error', 'counter', 1,
metric_tags={'fqdn': fqdn, 'txt_record': token})
raise Exception("Unable to query DNS token for fqdn {}.".format(fqdn))
return
@ -105,7 +111,7 @@ def create_txt_record(domain, token, account_number):
zone.add_record(node_name, record_type='TXT', txtdata="\"{}\"".format(token), ttl=5)
zone.publish()
current_app.logger.debug("TXT record created: {0}, token: {1}".format(fqdn, token))
except DynectCreateError as e:
except (DynectCreateError, DynectUpdateError) as e:
if "Cannot duplicate existing record data" in e.message:
current_app.logger.debug(
"Unable to add record. Domain: {}. Token: {}. "
@ -138,14 +144,33 @@ def delete_txt_record(change_id, account_number, domain, token):
all_txt_records = node.get_all_records_by_type('TXT')
except DynectGetError:
sentry.captureException()
metrics.send('delete_txt_record_error', 'counter', 1)
metrics.send('delete_txt_record_geterror', 'counter', 1)
# No Text Records remain or host is not in the zone anymore because all records have been deleted.
return
for txt_record in all_txt_records:
if txt_record.txtdata == ("{}".format(token)):
current_app.logger.debug("Deleting TXT record name: {0}".format(fqdn))
txt_record.delete()
zone.publish()
try:
txt_record.delete()
except DynectDeleteError:
sentry.captureException(
extra={
"fqdn": fqdn, "zone_name": zone_name, "node_name": node_name,
"txt_record": txt_record.txtdata}
)
metrics.send('delete_txt_record_deleteerror', 'counter', 1,
metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata})
try:
zone.publish()
except DynectUpdateError:
sentry.captureException(
extra={
"fqdn": fqdn, "zone_name": zone_name, "node_name": node_name,
"txt_record": txt_record.txtdata}
)
metrics.send('delete_txt_record_publish_error', 'counter', 1,
metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata})
def delete_acme_txt_records(domain):
@ -171,7 +196,16 @@ def delete_acme_txt_records(domain):
all_txt_records = node.get_all_records_by_type('TXT')
for txt_record in all_txt_records:
current_app.logger.debug("Deleting TXT record name: {0}".format(fqdn))
txt_record.delete()
try:
txt_record.delete()
except DynectDeleteError:
sentry.captureException(
extra={
"fqdn": fqdn, "zone_name": zone_name, "node_name": node_name,
"txt_record": txt_record.txtdata}
)
metrics.send('delete_txt_record_deleteerror', 'counter', 1,
metric_tags={'fqdn': fqdn, 'txt_record': txt_record.txtdata})
zone.publish()

View File

@ -142,9 +142,9 @@ class AcmeHandler(object):
try:
orderr = acme_client.finalize_order(order, deadline)
except AcmeError:
sentry.captureException()
sentry.captureException(extra={"order_url": order.uri})
metrics.send('request_certificate_error', 'counter', 1)
current_app.logger.error(f"Unable to resolve Acme order: {order}", exc_info=True)
current_app.logger.error(f"Unable to resolve Acme order: {order.uri}", exc_info=True)
raise
pem_certificate = OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
@ -289,9 +289,10 @@ class AcmeHandler(object):
dns_challenges = authz_record.dns_challenge
host_to_validate = self.maybe_remove_wildcard(authz_record.host)
host_to_validate = self.maybe_add_extension(host_to_validate, dns_provider_options)
dns_provider_plugin = self.get_dns_provider(dns_provider.provider_type)
for dns_challenge in dns_challenges:
try:
dns_provider.delete_txt_record(
dns_provider_plugin.delete_txt_record(
authz_record.change_id,
account_number,
dns_challenge.validation_domain_name(host_to_validate),

View File

@ -10,7 +10,7 @@ from flask import current_app
from retrying import retry
from lemur.extensions import metrics
from lemur.extensions import metrics, sentry
from lemur.exceptions import InvalidListener
from lemur.plugins.lemur_aws.sts import sts_client
@ -149,7 +149,7 @@ def describe_listeners_v2(**kwargs):
@sts_client('elb')
@retry(retry_on_exception=retry_throttled, wait_fixed=2000)
@retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20)
def describe_load_balancer_policies(load_balancer_name, policy_names, **kwargs):
"""
Fetching all policies currently associated with an ELB.
@ -157,11 +157,18 @@ def describe_load_balancer_policies(load_balancer_name, policy_names, **kwargs):
:param load_balancer_name:
:return:
"""
return kwargs['client'].describe_load_balancer_policies(LoadBalancerName=load_balancer_name, PolicyNames=policy_names)
try:
return kwargs['client'].describe_load_balancer_policies(LoadBalancerName=load_balancer_name,
PolicyNames=policy_names)
except Exception as e: # noqa
metrics.send('describe_load_balancer_policies_fail', 'counter', 1,
metric_tags={"load_balancer_name": load_balancer_name, "policy_names": policy_names, "error": e})
sentry.captureException(extra={"load_balancer_name": load_balancer_name, "policy_names": policy_names})
raise
@sts_client('elbv2')
@retry(retry_on_exception=retry_throttled, wait_fixed=2000)
@retry(retry_on_exception=retry_throttled, wait_fixed=2000, stop_max_attempt_number=20)
def describe_ssl_policies_v2(policy_names, **kwargs):
"""
Fetching all policies currently associated with an ELB.
@ -169,7 +176,13 @@ def describe_ssl_policies_v2(policy_names, **kwargs):
:param policy_names:
:return:
"""
return kwargs['client'].describe_ssl_policies(Names=policy_names)
try:
return kwargs['client'].describe_ssl_policies(Names=policy_names)
except Exception as e: # noqa
metrics.send('describe_ssl_policies_v2_fail', 'counter', 1,
metric_tags={"policy_names": policy_names, "error": e})
sentry.captureException(extra={"policy_names": policy_names})
raise
@sts_client('elb')