mirror of
https://blitiri.com.ar/repos/chasquid
synced 2025-12-17 14:37:02 +00:00
normalize: Improve ToCRLF/StringToCRLF performance
The ToCRLF/StringToCRLF functions are not very performance critical, but
we call them for each mail, and the current implementation is very
inefficient (mainly because it goes one byte at a time).
This patch replaces it with a better implementation that goes line by line.
The new implementation of ToCRLF is ~40% faster, and StringToCRLF is ~60%
faster.
```
$ benchstat old.txt new.txt
goos: linux
goarch: amd64
pkg: blitiri.com.ar/go/chasquid/internal/normalize
cpu: 13th Gen Intel(R) Core(TM) i9-13900T
│ old.txt │ new.txt │
│ sec/op │ sec/op vs base │
ToCRLF-32 162.96µ ± 6% 95.42µ ± 12% -41.44% (p=0.000 n=10)
StringToCRLF-32 190.70µ ± 14% 76.51µ ± 6% -59.88% (p=0.000 n=10)
geomean 176.3µ 85.44µ -51.53%
```
This commit is contained in:
@@ -78,23 +78,63 @@ func DomainToUnicode(addr string) (string, error) {
|
|||||||
// preexisting CRLF, it leaves it be. It assumes that CR is never used on its
|
// preexisting CRLF, it leaves it be. It assumes that CR is never used on its
|
||||||
// own.
|
// own.
|
||||||
func ToCRLF(in []byte) []byte {
|
func ToCRLF(in []byte) []byte {
|
||||||
b := bytes.NewBuffer(nil)
|
b := bytes.Buffer{}
|
||||||
b.Grow(len(in))
|
b.Grow(len(in))
|
||||||
for _, c := range in {
|
|
||||||
switch c {
|
// We go line by line, but beware:
|
||||||
case '\r':
|
// Split("a\nb", "\n") -> ["a", "b"]
|
||||||
// Ignore CR, we'll add it back later. It should never appear
|
// Split("a\nb\n", "\n") -> ["a", "b", ""]
|
||||||
// alone in the contexts where this function is used.
|
// So we handle the last line separately.
|
||||||
case '\n':
|
lines := bytes.Split(in, []byte("\n"))
|
||||||
b.Write([]byte("\r\n"))
|
for i, line := range lines {
|
||||||
default:
|
b.Write(line)
|
||||||
b.WriteByte(c)
|
if i == len(lines)-1 {
|
||||||
|
// Do not add newline to the last line:
|
||||||
|
// - If the string ends with a newline, we already added it in
|
||||||
|
// the previous-to-last line, and this line is "".
|
||||||
|
// - If the string does NOT end with a newline, this preserves
|
||||||
|
// that property.
|
||||||
|
break
|
||||||
}
|
}
|
||||||
|
if !bytes.HasSuffix(line, []byte("\r")) {
|
||||||
|
// Missing the CR.
|
||||||
|
b.WriteByte('\r')
|
||||||
|
}
|
||||||
|
b.WriteByte('\n')
|
||||||
}
|
}
|
||||||
|
|
||||||
return b.Bytes()
|
return b.Bytes()
|
||||||
}
|
}
|
||||||
|
|
||||||
// StringToCRLF is like ToCRLF, but operates on strings.
|
// StringToCRLF is like ToCRLF, but operates on strings.
|
||||||
func StringToCRLF(in string) string {
|
func StringToCRLF(in string) string {
|
||||||
return string(ToCRLF([]byte(in)))
|
// We implement it the same way as ToCRLF, but with string versions.
|
||||||
|
// This is significantly faster than converting the string to a byte
|
||||||
|
// slice, calling ToCRLF, and converting it back.
|
||||||
|
b := strings.Builder{}
|
||||||
|
b.Grow(len(in))
|
||||||
|
|
||||||
|
// We go line by line, but beware:
|
||||||
|
// Split("a\nb", "\n") -> ["a", "b"]
|
||||||
|
// Split("a\nb\n", "\n") -> ["a", "b", ""]
|
||||||
|
// So we handle the last line separately.
|
||||||
|
lines := strings.Split(in, "\n")
|
||||||
|
for i, line := range lines {
|
||||||
|
b.WriteString(line)
|
||||||
|
if i == len(lines)-1 {
|
||||||
|
// Do not add newline to the last line:
|
||||||
|
// - If the string ends with a newline, we already added it in
|
||||||
|
// the previous-to-last line, and this line is "".
|
||||||
|
// - If the string does NOT end with a newline, this preserves
|
||||||
|
// that property.
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if !strings.HasSuffix(line, "\r") {
|
||||||
|
// Missing the CR.
|
||||||
|
b.WriteByte('\r')
|
||||||
|
}
|
||||||
|
b.WriteByte('\n')
|
||||||
|
}
|
||||||
|
|
||||||
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
package normalize
|
package normalize
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"bytes"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
func TestUser(t *testing.T) {
|
func TestUser(t *testing.T) {
|
||||||
valid := []struct{ user, norm string }{
|
valid := []struct{ user, norm string }{
|
||||||
@@ -134,8 +138,19 @@ func TestToCRLF(t *testing.T) {
|
|||||||
in, out string
|
in, out string
|
||||||
}{
|
}{
|
||||||
{"", ""},
|
{"", ""},
|
||||||
|
{"a", "a"},
|
||||||
|
|
||||||
|
// Does not end in newline.
|
||||||
|
{"a\n", "a\r\n"},
|
||||||
{"a\nb", "a\r\nb"},
|
{"a\nb", "a\r\nb"},
|
||||||
{"a\r\nb", "a\r\nb"},
|
{"a\r\nb", "a\r\nb"},
|
||||||
|
|
||||||
|
// Ends in newline.
|
||||||
|
{"a\nb\n", "a\r\nb\r\n"},
|
||||||
|
{"a\r\nb\n", "a\r\nb\r\n"},
|
||||||
|
{"a\r\nb\r\n", "a\r\nb\r\n"},
|
||||||
|
{"a\r\nb\n\n", "a\r\nb\r\n\r\n"},
|
||||||
|
{"a\r\nb\r\n\r\n", "a\r\nb\r\n\r\n"},
|
||||||
}
|
}
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
got := string(ToCRLF([]byte(c.in)))
|
got := string(ToCRLF([]byte(c.in)))
|
||||||
@@ -173,3 +188,31 @@ func FuzzDomainToUnicode(f *testing.F) {
|
|||||||
DomainToUnicode(addr)
|
DomainToUnicode(addr)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func BenchmarkToCRLF(b *testing.B) {
|
||||||
|
// Generate a 1000-line message.
|
||||||
|
bb := bytes.Buffer{}
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
bb.WriteString("this is a very pretty line 🐅\n")
|
||||||
|
}
|
||||||
|
buf := bb.Bytes()
|
||||||
|
|
||||||
|
b.ResetTimer()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
ToCRLF(buf)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkStringToCRLF(b *testing.B) {
|
||||||
|
// Generate a 1000-line message.
|
||||||
|
sb := strings.Builder{}
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
sb.WriteString("this is a very pretty line 🐅\n")
|
||||||
|
}
|
||||||
|
s := sb.String()
|
||||||
|
|
||||||
|
b.ResetTimer()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
StringToCRLF(s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user