Browse Source

Fix chopping up multi-byte Unicode characters (#21)

UTF-8 encoded multi-byte code points could be split in half when truncating to a maximum length, producing invalid Unicode. Now we
count runes rather than bytes. This has the side effect that feeds with multi-byte Unicode characters get larger. But oh well, it should
actually be even better to read.

-----

Unit tests pass. Didn't actually test more than that. 8-)

Co-authored-by: Lysander Trischler <software@lyse.isobeef.org>
Reviewed-on: #21
Co-authored-by: lyse <lyse@noreply@mills.io>
Co-committed-by: lyse <lyse@noreply@mills.io>
pull/23/head
lyse 6 months ago committed by James Mills
parent
commit
12c271ea14
  1. 1
      .gitignore
  2. 7
      feeds.go
  3. 77
      feeds_test.go

1
.gitignore vendored

@ -2,6 +2,7 @@
*.bak
*.txt
.DS_Store
*.sw?
/dist
/feeds

7
feeds.go

@ -49,8 +49,9 @@ func ProcessFeedContent(title, desc string, max int) string {
return fmt.Sprintf("%s: %s", title, err)
}
markdown = CleanTwt(fmt.Sprintf("**%s**\n%s", title, markdown))
if len(markdown) > max {
return fmt.Sprintf("%s ...", markdown[:max])
markdownRunes := []rune(markdown)
if len(markdownRunes) > max {
return fmt.Sprintf("%s ...", string(markdownRunes[:max]))
}
return markdown
}
@ -266,7 +267,7 @@ func UpdateTwitterFeed(conf *Config, name, handle string) error {
ResizeW: avatarResolution,
ResizeH: avatarResolution,
}
profile, err := twitterscraper.GetProfile(handle)
if err != nil {
log.WithError(err).Warnf("error retrieving twitter profile for %s", handle)

77
feeds_test.go

@ -0,0 +1,77 @@
package main
import (
"fmt"
"testing"
)
// TestProcessFeedContent is a table-driven test for ProcessFeedContent. Each
// case passes a title, a body and a maximum length and compares the returned
// string with the expected result. The cases cover a maximum above, equal to
// and below the content's rune length, plus a multi-byte title that must be
// truncated on a rune boundary.
//
// On a mismatch the test reports both strings with their byte and rune
// lengths and a hint pointing at the first rune where they diverge.
func TestProcessFeedContent(t *testing.T) {
	for _, testCase := range []struct {
		name   string
		title  string
		body   string
		max    int
		result string
	}{
		{
			name:   "when max rune length is greater than content length then return whole content",
			title:  "Some Test",
			body:   "Oh yeah, this is cool!",
			max:    37,
			result: "**Some Test**\u2028Oh yeah, this is cool!",
		},
		{
			name:   "when max rune length is exactly the content length then return whole content",
			title:  "Some Test",
			body:   "Oh yeah, this is cool!",
			max:    36,
			result: "**Some Test**\u2028Oh yeah, this is cool!",
		},
		{
			name:   "when max rune length is smaller than content length then return truncated content",
			title:  "Some Test",
			body:   "Oh yeah, this is cool!",
			max:    35,
			result: "**Some Test**\u2028Oh yeah, this is cool ...",
		},
		{
			name:   "when max rune length is smaller than content length and content is multi-byte Unicode then return truncated content with multi-byte characters intact",
			title:  "Äöüß", // all these umlauts are two bytes long encoded in UTF-8 and would be chopped in half when counting bytes instead of runes
			body:   "truncated anyways",
			max:    5,
			result: "**Äöü ...",
		},
	} {
		t.Run(testCase.name, func(t *testing.T) {
			actual := ProcessFeedContent(testCase.title, testCase.body, testCase.max)
			if testCase.result == actual {
				return
			}

			expectedRunes := []rune(testCase.result)
			actualRunes := []rune(actual)

			// Describe where the two strings first diverge, rune by rune.
			hint := func() string {
				for i, expectedRune := range expectedRunes {
					if i >= len(actualRunes) {
						return fmt.Sprintf("actual too short, ends at rune index %d", i)
					}
					actualRune := actualRunes[i]
					if expectedRune != actualRune {
						return fmt.Sprintf("first difference at rune index %d:\n"+
							"expected: '%s' (%U)\n"+
							"actual: '%s' (%U)",
							i, string(expectedRune), expectedRune,
							string(actualRune), actualRune)
					}
				}
				if len(actualRunes) == len(expectedRunes) {
					// The strings differ but decode to the same runes; this
					// happens when invalid UTF-8 bytes are involved, because
					// each invalid byte decodes to U+FFFD.
					return "rune sequences are identical, strings differ only in their raw bytes"
				}
				return fmt.Sprintf("actual too long, expected ends at rune index %d", len(expectedRunes))
			}()

			// The hint is passed as an argument rather than appended to the
			// format string, so a '%' inside it is printed literally.
			t.Errorf("markdown not equal\n"+
				"expected: '%s' (byte length %d, rune length %d)\n"+
				"actual: '%s' (byte length %d, rune length %d)\n%s",
				testCase.result, len(testCase.result), len(expectedRunes),
				actual, len(actual), len(actualRunes),
				hint)
		})
	}
}
Loading…
Cancel
Save