1
0
mirror of https://blitiri.com.ar/repos/chasquid synced 2025-12-18 14:47:03 +00:00

smtp: Try all entries in MX, not just the first one

Currently, we pick the first host in the MX list, and attempt delivery
there. If it fails, we just report the failure to the queue, which will
wait for some time and then try again.

This is not ideal: we should fall back to the other MXs in the list, as
the first host could be having issues for a long time, and not
attempting with the rest just delays delivery.

This patch implements the fallback, so we try all MXs before deciding to
report a failed delivery (unless, of course, an MX returned a permanent
failure).
This commit is contained in:
Alberto Bertogli
2017-02-26 03:26:07 +00:00
parent c2ea8a8ef0
commit b8551729db
2 changed files with 111 additions and 59 deletions

View File

@@ -27,7 +27,7 @@ var (
"port to use for outgoing SMTP connections, ONLY FOR TESTING") "port to use for outgoing SMTP connections, ONLY FOR TESTING")
// Fake MX records, used for testing only. // Fake MX records, used for testing only.
fakeMX = map[string]string{} fakeMX = map[string][]string{}
) )
// Exported variables. // Exported variables.
@@ -42,31 +42,72 @@ type SMTP struct {
} }
func (s *SMTP) Deliver(from string, to string, data []byte) (error, bool) { func (s *SMTP) Deliver(from string, to string, data []byte) (error, bool) {
tr := trace.New("Courier.SMTP", to) a := &attempt{
defer tr.Finish() courier: s,
tr.Debugf("%s -> %s", from, to) from: from,
to: to,
data: data,
tr: trace.New("Courier.SMTP", to),
}
defer a.tr.Finish()
a.tr.Debugf("%s -> %s", from, to)
// smtp.Client.Mail will add the <> for us when the address is empty.
if a.from == "<>" {
a.from = ""
}
toDomain := envelope.DomainOf(to) toDomain := envelope.DomainOf(to)
mx, err := lookupMX(tr, toDomain) mxs, err := lookupMXs(a.tr, toDomain)
if err != nil { if err != nil {
// Note this is considered a permanent error. // Note this is considered a permanent error.
// This is in line with what other servers (Exim) do. However, the // This is in line with what other servers (Exim) do. However, the
// downside is that temporary DNS issues can affect delivery, so we // downside is that temporary DNS issues can affect delivery, so we
// have to make sure we try hard enough on the lookup above. // have to make sure we try hard enough on the lookup above.
return tr.Errorf("Could not find mail server: %v", err), true return a.tr.Errorf("Could not find mail server: %v", err), true
} }
// Issue an EHLO with a valid domain; otherwise, some servers like postfix // Issue an EHLO with a valid domain; otherwise, some servers like postfix
// will complain. // will complain.
helloDomain, err := idna.ToASCII(envelope.DomainOf(from)) a.helloDomain, err = idna.ToASCII(envelope.DomainOf(from))
if err != nil { if err != nil {
return tr.Errorf("Sender domain not IDNA compliant: %v", err), true return a.tr.Errorf("Sender domain not IDNA compliant: %v", err), true
} }
if helloDomain == "" { if a.helloDomain == "" {
// This can happen when sending bounces. Last resort. // This can happen when sending bounces. Last resort.
helloDomain, _ = os.Hostname() a.helloDomain, _ = os.Hostname()
} }
for _, mx := range mxs {
var permanent bool
err, permanent = a.deliver(mx)
if err == nil {
return nil, false
}
if permanent {
return err, true
}
a.tr.Errorf("%q returned transient error: %v", mx, err)
}
// We exhausted all MXs and failed to deliver; try again later.
return a.tr.Errorf("all MXs returned transient failures (last: %v)", err), false
}
type attempt struct {
courier *SMTP
from string
to string
data []byte
toDomain string
helloDomain string
tr *trace.Trace
}
func (a *attempt) deliver(mx string) (error, bool) {
// Do we use insecure TLS? // Do we use insecure TLS?
// Set as fallback when retrying. // Set as fallback when retrying.
insecure := false insecure := false
@@ -75,18 +116,18 @@ func (s *SMTP) Deliver(from string, to string, data []byte) (error, bool) {
retry: retry:
conn, err := net.DialTimeout("tcp", mx+":"+*smtpPort, smtpDialTimeout) conn, err := net.DialTimeout("tcp", mx+":"+*smtpPort, smtpDialTimeout)
if err != nil { if err != nil {
return tr.Errorf("Could not dial: %v", err), false return a.tr.Errorf("Could not dial: %v", err), false
} }
defer conn.Close() defer conn.Close()
conn.SetDeadline(time.Now().Add(smtpTotalTimeout)) conn.SetDeadline(time.Now().Add(smtpTotalTimeout))
c, err := smtp.NewClient(conn, mx) c, err := smtp.NewClient(conn, mx)
if err != nil { if err != nil {
return tr.Errorf("Error creating client: %v", err), false return a.tr.Errorf("Error creating client: %v", err), false
} }
if err = c.Hello(helloDomain); err != nil { if err = c.Hello(a.helloDomain); err != nil {
return tr.Errorf("Error saying hello: %v", err), false return a.tr.Errorf("Error saying hello: %v", err), false
} }
if ok, _ := c.Extension("STARTTLS"); ok { if ok, _ := c.Extension("STARTTLS"); ok {
@@ -100,101 +141,110 @@ retry:
// fail verification we just try again without validating. // fail verification we just try again without validating.
if insecure { if insecure {
tlsCount.Add("tls:failed", 1) tlsCount.Add("tls:failed", 1)
return tr.Errorf("TLS error: %v", err), false return a.tr.Errorf("TLS error: %v", err), false
} }
insecure = true insecure = true
tr.Debugf("TLS error, retrying insecurely") a.tr.Debugf("TLS error, retrying insecurely")
goto retry goto retry
} }
if config.InsecureSkipVerify { if config.InsecureSkipVerify {
tr.Debugf("Insecure - using TLS, but cert does not match %s", mx) a.tr.Debugf("Insecure - using TLS, but cert does not match %s", mx)
tlsCount.Add("tls:insecure", 1) tlsCount.Add("tls:insecure", 1)
secLevel = domaininfo.SecLevel_TLS_INSECURE secLevel = domaininfo.SecLevel_TLS_INSECURE
} else { } else {
tlsCount.Add("tls:secure", 1) tlsCount.Add("tls:secure", 1)
tr.Debugf("Secure - using TLS") a.tr.Debugf("Secure - using TLS")
secLevel = domaininfo.SecLevel_TLS_SECURE secLevel = domaininfo.SecLevel_TLS_SECURE
} }
} else { } else {
tlsCount.Add("plain", 1) tlsCount.Add("plain", 1)
tr.Debugf("Insecure - NOT using TLS") a.tr.Debugf("Insecure - NOT using TLS")
} }
if toDomain != "" && !s.Dinfo.OutgoingSecLevel(toDomain, secLevel) { if a.toDomain != "" && !a.courier.Dinfo.OutgoingSecLevel(a.toDomain, secLevel) {
// We consider the failure transient, so transient misconfigurations // We consider the failure transient, so transient misconfigurations
// do not affect deliveries. // do not affect deliveries.
slcResults.Add("fail", 1) slcResults.Add("fail", 1)
return tr.Errorf("Security level check failed (level:%s)", secLevel), false return a.tr.Errorf("Security level check failed (level:%s)", secLevel), false
} }
slcResults.Add("pass", 1) slcResults.Add("pass", 1)
// c.Mail will add the <> for us when the address is empty. if err = c.MailAndRcpt(a.from, a.to); err != nil {
if from == "<>" { return a.tr.Errorf("MAIL+RCPT %v", err), smtp.IsPermanent(err)
from = ""
}
if err = c.MailAndRcpt(from, to); err != nil {
return tr.Errorf("MAIL+RCPT %v", err), smtp.IsPermanent(err)
} }
w, err := c.Data() w, err := c.Data()
if err != nil { if err != nil {
return tr.Errorf("DATA %v", err), smtp.IsPermanent(err) return a.tr.Errorf("DATA %v", err), smtp.IsPermanent(err)
} }
_, err = w.Write(data) _, err = w.Write(a.data)
if err != nil { if err != nil {
return tr.Errorf("DATA writing: %v", err), smtp.IsPermanent(err) return a.tr.Errorf("DATA writing: %v", err), smtp.IsPermanent(err)
} }
err = w.Close() err = w.Close()
if err != nil { if err != nil {
return tr.Errorf("DATA closing %v", err), smtp.IsPermanent(err) return a.tr.Errorf("DATA closing %v", err), smtp.IsPermanent(err)
} }
c.Quit() c.Quit()
tr.Debugf("done") a.tr.Debugf("done")
return nil, false return nil, false
} }
func lookupMX(tr *trace.Trace, domain string) (string, error) { func lookupMXs(tr *trace.Trace, domain string) ([]string, error) {
if v, ok := fakeMX[domain]; ok { if v, ok := fakeMX[domain]; ok {
return v, nil return v, nil
} }
domain, err := idna.ToASCII(domain) domain, err := idna.ToASCII(domain)
if err != nil { if err != nil {
return "", err return nil, err
} }
mxs, err := net.LookupMX(domain) mxs := []string{}
if err == nil {
if len(mxs) == 0 { mxRecords, err := net.LookupMX(domain)
tr.Debugf("domain %q has no MX, falling back to A", domain) if err != nil {
return domain, nil // There was an error. It could be that the domain has no MX, in which
// case we have to fall back to A, or a bigger problem.
// Unfortunately, go's API doesn't let us easily distinguish between
// them. For now, if the error is permanent, we assume it's because
// there was no MX and fall back, otherwise we return.
// TODO: Find a better way to do this.
dnsErr, ok := err.(*net.DNSError)
if !ok {
tr.Debugf("MX lookup error: %v", err)
return nil, err
} else if dnsErr.Temporary() {
tr.Debugf("temporary DNS error: %v", dnsErr)
return nil, err
} }
tr.Debugf("MX %s", mxs[0].Host) // Permanent error, we assume MX does not exist and fall back to A.
return mxs[0].Host, nil tr.Debugf("failed to resolve MX for %s, falling back to A", domain)
mxs = []string{domain}
} else {
// Convert the DNS records to a plain string slice. They're already
// sorted by priority.
for _, r := range mxRecords {
mxs = append(mxs, r.Host)
}
} }
// There was an error. It could be that the domain has no MX, in which // Note that mxs could be empty; in that case we do NOT fall back to A.
// case we have to fall back to A, or a bigger problem. // This case is explicitly covered by the SMTP RFC.
// Unfortunately, go's API doesn't let us easily distinguish between them. // https://tools.ietf.org/html/rfc5321#section-5.1
// For now, if the error is permanent, we assume it's because there was no
// MX and fall back, otherwise we return. // Cap the list of MXs to 5 hosts, to keep delivery attempt times sane
// TODO: Find a better way to do this. // and prevent abuse.
dnsErr, ok := err.(*net.DNSError) if len(mxs) > 5 {
if !ok { mxs = mxs[:5]
tr.Debugf("MX lookup error: %v", err)
return "", err
} else if dnsErr.Temporary() {
tr.Debugf("temporary DNS error: %v", dnsErr)
return "", err
} }
// Permanent error, we assume MX does not exist and fall back to A. tr.Debugf("MXs: %v", mxs)
tr.Debugf("failed to resolve MX for %s, falling back to A", domain) return mxs, nil
return domain, nil
} }

View File

@@ -87,7 +87,9 @@ func TestSMTP(t *testing.T) {
addr := fakeServer(t, responses) addr := fakeServer(t, responses)
host, port, _ := net.SplitHostPort(addr) host, port, _ := net.SplitHostPort(addr)
fakeMX["to"] = host // Put a non-existing host first, so we check that if the first host
// doesn't work, we try with the rest.
fakeMX["to"] = []string{"nonexistinghost", host}
*smtpPort = port *smtpPort = port
s, tmpDir := newSMTP(t) s, tmpDir := newSMTP(t)
@@ -148,7 +150,7 @@ func TestSMTPErrors(t *testing.T) {
addr := fakeServer(t, rs) addr := fakeServer(t, rs)
host, port, _ := net.SplitHostPort(addr) host, port, _ := net.SplitHostPort(addr)
fakeMX["to"] = host fakeMX["to"] = []string{host}
*smtpPort = port *smtpPort = port
s, tmpDir := newSMTP(t) s, tmpDir := newSMTP(t)