From 41bb7b6f5eb52f18b9fd5d4270234135ebd4ab96 Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Thu, 31 Oct 2024 13:05:57 +0000 Subject: [PATCH] normalize: Improve ToCRLF/StringToCRLF performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ToCRLF/StringToCRLF functions are not very performance critical, but we call it for each mail, and the current implementation is very inefficient (mainly because it goes one byte at a time). This patch replaces it with a better implementation that goes line by line. The new implementation of ToCRLF is ~40% faster, and StringToCRLF is ~60% faster. ``` $ benchstat old.txt new.txt goos: linux goarch: amd64 pkg: blitiri.com.ar/go/chasquid/internal/normalize cpu: 13th Gen Intel(R) Core(TM) i9-13900T │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ ToCRLF-32 162.96µ ± 6% 95.42µ ± 12% -41.44% (p=0.000 n=10) StringToCRLF-32 190.70µ ± 14% 76.51µ ± 6% -59.88% (p=0.000 n=10) geomean 176.3µ 85.44µ -51.53% ``` --- internal/normalize/normalize.go | 62 +++++++++++++++++++++++----- internal/normalize/normalize_test.go | 45 +++++++++++++++++++- 2 files changed, 95 insertions(+), 12 deletions(-) diff --git a/internal/normalize/normalize.go b/internal/normalize/normalize.go index 17768a5..6fd6f6a 100644 --- a/internal/normalize/normalize.go +++ b/internal/normalize/normalize.go @@ -78,23 +78,63 @@ func DomainToUnicode(addr string) (string, error) { // preexisting CRLF, it leaves it be. It assumes that CR is never used on its // own. func ToCRLF(in []byte) []byte { - b := bytes.NewBuffer(nil) + b := bytes.Buffer{} b.Grow(len(in)) - for _, c := range in { - switch c { - case '\r': - // Ignore CR, we'll add it back later. It should never appear - // alone in the contexts where this function is used. - case '\n': - b.Write([]byte("\r\n")) - default: - b.WriteByte(c) + + // We go line by line, but beware: + // Split("a\nb", "\n") -> ["a", "b"] + // Split("a\nb\n", "\n") -> ["a", "b", ""] + // So we handle the last line separately. + lines := bytes.Split(in, []byte("\n")) + for i, line := range lines { + b.Write(line) + if i == len(lines)-1 { + // Do not add newline to the last line: + // - If the string ends with a newline, we already added it in + // the previous-to-last line, and this line is "". + // - If the string does NOT end with a newline, this preserves + // that property. + break } + if !bytes.HasSuffix(line, []byte("\r")) { + // Missing the CR. + b.WriteByte('\r') + } + b.WriteByte('\n') } + return b.Bytes() } // StringToCRLF is like ToCRLF, but operates on strings. func StringToCRLF(in string) string { - return string(ToCRLF([]byte(in))) + // We implement it the same way as ToCRLF, but with string versions. + // This is significantly faster than converting the string to a byte + // slice, calling ToCRLF, and converting it back. + b := strings.Builder{} + b.Grow(len(in)) + + // We go line by line, but beware: + // Split("a\nb", "\n") -> ["a", "b"] + // Split("a\nb\n", "\n") -> ["a", "b", ""] + // So we handle the last line separately. + lines := strings.Split(in, "\n") + for i, line := range lines { + b.WriteString(line) + if i == len(lines)-1 { + // Do not add newline to the last line: + // - If the string ends with a newline, we already added it in + // the previous-to-last line, and this line is "". + // - If the string does NOT end with a newline, this preserves + // that property. + break + } + if !strings.HasSuffix(line, "\r") { + // Missing the CR. + b.WriteByte('\r') + } + b.WriteByte('\n') + } + + return b.String() } diff --git a/internal/normalize/normalize_test.go b/internal/normalize/normalize_test.go index e5c7829..66bd324 100644 --- a/internal/normalize/normalize_test.go +++ b/internal/normalize/normalize_test.go @@ -1,6 +1,10 @@ package normalize -import "testing" +import ( + "bytes" + "strings" + "testing" +) func TestUser(t *testing.T) { valid := []struct{ user, norm string }{ @@ -134,8 +138,19 @@ func TestToCRLF(t *testing.T) { in, out string }{ {"", ""}, + {"a", "a"}, + + // Does not end in newline. + {"a\n", "a\r\n"}, {"a\nb", "a\r\nb"}, {"a\r\nb", "a\r\nb"}, + + // Ends in newline. + {"a\nb\n", "a\r\nb\r\n"}, + {"a\r\nb\n", "a\r\nb\r\n"}, + {"a\r\nb\r\n", "a\r\nb\r\n"}, + {"a\r\nb\n\n", "a\r\nb\r\n\r\n"}, + {"a\r\nb\r\n\r\n", "a\r\nb\r\n\r\n"}, } for _, c := range cases { got := string(ToCRLF([]byte(c.in))) @@ -173,3 +188,31 @@ func FuzzDomainToUnicode(f *testing.F) { DomainToUnicode(addr) }) } + +func BenchmarkToCRLF(b *testing.B) { + // Generate a 1000-line message. + bb := bytes.Buffer{} + for i := 0; i < 1000; i++ { + bb.WriteString("this is a very pretty line 🐅\n") + } + buf := bb.Bytes() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + ToCRLF(buf) + } +} + +func BenchmarkStringToCRLF(b *testing.B) { + // Generate a 1000-line message. + sb := strings.Builder{} + for i := 0; i < 1000; i++ { + sb.WriteString("this is a very pretty line 🐅\n") + } + s := sb.String() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + StringToCRLF(s) + } +}