encoder/filename: Add SCSU as tables

Instead of only adding SCSU, add it as an existing table.

Allow plain SCSU output (no table), and also add a hopefully reasonable table for use on top of SCSU.

Also add byte interfaces, `EncodeBytes` and `DecodeBytes`, that skip the base64 URL encoding.

Fuzz tested and decode tests added.
This commit is contained in:
Klaus Post 2021-01-04 17:09:09 +01:00 committed by Nick Craig-Wood
parent 47b69d6300
commit 424aaac2e1
8 changed files with 201 additions and 24 deletions

1
go.mod
View file

@ -21,6 +21,7 @@ require (
github.com/calebcase/tmpfile v1.0.2 // indirect
github.com/colinmarc/hdfs/v2 v2.2.0
github.com/coreos/go-semver v0.3.0
github.com/dop251/scsu v0.0.0-20200422003335-8fadfb689669
github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible
github.com/gabriel-vasile/mimetype v1.1.2
github.com/gogo/protobuf v1.3.2 // indirect

3
go.sum
View file

@ -171,11 +171,14 @@ github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumC
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE=
github.com/dop251/scsu v0.0.0-20200422003335-8fadfb689669 h1:e28M2/odOZjMc1J2ZZwgex6NM9+aqr1nMlTqPLayxbk=
github.com/dop251/scsu v0.0.0-20200422003335-8fadfb689669/go.mod h1:Gth7Xev0h28tuTayG4HlTZy90IXhiDgV2+MLtJzjpP0=
github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible h1:DtumzkLk2zZ2SeElEr+VNz+zV7l+BTe509cV4sKPXbM=
github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible/go.mod h1:lr+LhMM3F6Y3lW1T9j2U5l7QeuWm87N9+PPXo3yH4qY=
github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/dustin/go-humanize v0.0.0-20180421182945-02af3965c54e/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813 h1:NgO45/5mBLRVfiXerEFzH6ikcZ7DNRPS639xFg3ENzU=
github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw=
github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs=
github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU=

View file

@ -7,6 +7,7 @@ import (
"errors"
"sync"
"github.com/dop251/scsu"
"github.com/klauspost/compress/huff0"
)
@ -22,6 +23,7 @@ var customDecMu sync.Mutex
// Decode an encoded string.
func Decode(s string) (string, error) {
initCoders()
if len(s) < 1 {
return "", ErrCorrupted
}
@ -31,19 +33,25 @@ func Decode(s string) (string, error) {
}
table--
s = s[1:]
data := make([]byte, base64.URLEncoding.DecodedLen(len(s)))
n, err := base64.URLEncoding.Decode(data, ([]byte)(s))
if err != nil || n < 0 {
return "", ErrCorrupted
}
data = data[:n]
return DecodeBytes(table, data)
}
// DecodeBytes will decode raw id and data values.
func DecodeBytes(table byte, data []byte) (string, error) {
initCoders()
switch table {
case tableUncompressed:
return string(data), nil
case tableReserved:
return "", ErrUnsupported
case tableSCSUPlain:
return scsu.Decode(data)
case tableRLE:
if len(data) < 2 {
return "", ErrCorrupted
@ -79,6 +87,9 @@ func Decode(s string) (string, error) {
if err != nil {
return "", ErrCorrupted
}
if table == tableSCSU {
return scsu.Decode(name)
}
return string(name), nil
}
}

View file

@ -0,0 +1,92 @@
package filename
import "testing"
// TestDecode runs Decode over a fixed set of previously encoded names and
// checks that each one yields the expected original string.
//
// A case may be added with an empty encoded value: if decoding it fails
// unexpectedly, the test reports a freshly encoded candidate (and its table
// id) that can be pasted into the case.
func TestDecode(t *testing.T) {
	type decodeCase struct {
		name    string
		encoded string
		want    string
		wantErr bool
	}
	// wantErr is left at its zero value (false) for every case below.
	cases := []decodeCase{
		{
			name:    "unicode-1",
			encoded: "8D5V3MESVd-WEF7WuqaOvpKUWtYGEyw5UDQ==",
			want:    "長い長いUNICODEファイル名",
		},
		{
			name:    "unicode-2",
			encoded: "8GyHV1N7u2OEg4ufQ3eHQ3Ngg6N3X0CDg4-HX0NXU2tg=",
			want:    "ვეპხის ტყაოსანი შოთა რუსთაველი",
		},
		{
			name:    "unicode-3",
			encoded: "7LpehMXOrWe7mcT_lpf2MN1Nmgu55jpXHLavZcXJb2UTJ-UmGU15iznkD",
			want:    "Sønderjysk: Æ ka æe glass uhen at det go mæ naue.,",
		},
		{
			name:    "unicode-4",
			encoded: "7TCSRm0liJDR0ulpBq4Lla_XB2mWdLFMEs8wEQKHAGa8FRr333ntJ6Ww6_f__N5VKeYM=",
			want:    "Hello------world 時危兵甲滿天涯,載道流離起怨咨.bin",
		},
		{
			name:    "plain-1",
			encoded: "BzGQYxqHBA6ljTsir80gUM5Y=",
			want:    "-Duplican99E8ZI4___9_",
		},
		{
			name:    "hex-1",
			encoded: "D_--tHZROQpqqJ9PafqNa6STF",
			want:    "13646871dfabbs43323564654bbefff",
		},
		{
			name:    "base64-1",
			encoded: "FMpABB9Ef0KP8OrVxjnE3LzUePuLZi8pPg7eW8bgyW2d3Ucckf4rlE0mkAvlILVpOmF3L-rFbmNrpUO2HQFlF4SCMPVPeCEX6LeOg5JVpUVCXV1WSazD9vSpr",
			want:    "UxAYiB0FNTTkXRw9P8hwq-WmN7tYwbe-sFw8C3snDRG1d-yjrdOUVZQyLdtkJ8tuvhBSnuBiLjVieCAroWEZDIO4Hb_rKgdzPjMqFE7inwHJ2isF==",
		},
		{
			name:    "custom-1",
			encoded: "-BeADJCoG_________________xc=",
			want:    "Uaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
		},
		{
			name:    "rle-1",
			encoded: "9a2E=",
			want:    "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
		},
		{
			name:    "regular-1",
			encoded: "BeSSrnzj0j3OXyR9K81M=",
			want:    "regular-filename.txt",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := Decode(tc.encoded)
			unexpected := (err != nil) != tc.wantErr

			switch {
			case unexpected && tc.encoded == "" && tc.want != "":
				// The case has no encoded fixture yet: propose one.
				proposed := Encode(tc.want)
				table := decodeMap[proposed[0]] - 1
				t.Errorf("No encoded value, try '%s', table is %d", proposed, table)
			case unexpected:
				t.Errorf("Decode() error = %v, wantErr %v", err, tc.wantErr)
			case got != tc.want:
				t.Errorf("Decode() got = %v, want %v", got, tc.want)
			}
		})
	}
}

View file

@ -4,6 +4,7 @@ import (
"encoding/base64"
"encoding/binary"
"github.com/dop251/scsu"
"github.com/klauspost/compress/huff0"
)
@ -11,21 +12,45 @@ import (
// Calling Decode with the returned string should always succeed.
// It is not a requirement that the input string is valid utf-8.
func Encode(s string) string {
	// EncodeBytes picks the table and produces the raw payload.
	id, raw := EncodeBytes(s)
	// The table id is mapped through encodeURL to the leading character;
	// the payload follows as URL-safe base64.
	prefix := string(encodeURL[id])
	return prefix + base64.URLEncoding.EncodeToString(raw)
}
// EncodeBytes will compress the given string and return a table identifier and a payload.
func EncodeBytes(s string) (table byte, payload []byte) {
initCoders()
bestSize := len(s)
bestTable := tableUncompressed
bestTable := byte(tableUncompressed)
org := []byte(s)
bestOut := []byte(s)
// Try all tables and choose the best
for i, enc := range encTables[:] {
org := org
if len(org) <= 1 || len(org) > maxLength {
// Use the uncompressed
break
}
if enc == nil {
continue
}
if i == tableSCSU {
var err error
olen := len(org)
org, err = scsu.EncodeStrict(s, make([]byte, 0, len(org)))
if err != nil || olen <= len(org) {
continue
}
if len(org) < bestSize {
// This is already better, store so we can use if the table cannot.
bestOut = bestOut[:len(org)]
bestTable = tableSCSUPlain
bestSize = len(org)
copy(bestOut, org)
}
}
// Try to encode using table.
err := func() error {
encTableLocks[i].Lock()
@ -36,14 +61,14 @@ func Encode(s string) string {
}
if len(out) < bestSize {
bestOut = bestOut[:len(out)]
bestTable = i
bestTable = byte(i)
bestSize = len(out)
copy(bestOut, out)
}
return nil
}()
// If input is a single byte repeated store as RLE or save uncompressed.
if err == huff0.ErrUseRLE {
if err == huff0.ErrUseRLE && i != tableSCSU {
if len(org) > 2 {
// Encode as one byte repeated since it will be smaller than uncompressed.
n := binary.PutUvarint(bestOut, uint64(len(org)))
@ -56,5 +81,5 @@ func Encode(s string) string {
}
}
return string(encodeURL[bestTable]) + base64.URLEncoding.EncodeToString(bestOut)
return bestTable, bestOut
}

View file

@ -25,7 +25,9 @@ func Fuzz(data []byte) int {
panic(fmt.Sprintf("error decoding %q, input %q: %v", enc, string(data), err))
}
if !bytes.Equal(data, []byte(decoded)) {
panic(fmt.Sprintf("decode mismatch, encoded: %q, org: %q, got: %q", enc, string(data), decoded))
table := decodeMap[enc[0]]
table--
panic(fmt.Sprintf("decode mismatch, encoded: %q, org: %q, got: %q, table %d", enc, string(data), decoded, int(table)))
}
// Everything is good.

View file

@ -3,21 +3,33 @@
package main
import (
"bufio"
"bytes"
"encoding/base64"
"flag"
"fmt"
"io/ioutil"
"math"
"strings"
"unicode/utf8"
"github.com/dop251/scsu"
"github.com/klauspost/compress"
"github.com/klauspost/compress/huff0"
)
// Replace/add histogram data and execute go run gentable.go
// execute go run gentable.go
var indexFile = flag.String("index", "", "Index this file for table")
// Allow non-represented characters.
var addUnused = flag.Bool("all", true, "Make all bytes possible")
var scsuEncode = flag.Bool("scsu", false, "SCSU encode on each line before table")
func main() {
// Allow non-represented characters.
const omitUnused = false
flag.Parse()
histogram := [256]uint64{
// Replace/add histogram data and execute go run gentable.go
// ncw home directory
//0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19442, 760, 0, 349, 570, 1520, 199, 76, 685, 654, 0, 40377, 1605, 395132, 935270, 0, 1156377, 887730, 811737, 712241, 693240, 689139, 675964, 656417, 666577, 657413, 532, 24, 0, 145, 0, 3, 946, 44932, 37362, 46126, 36752, 76346, 19338, 47457, 14288, 38163, 4350, 7867, 36541, 65011, 30255, 26792, 22097, 1803, 39191, 61965, 76585, 11887, 12896, 5931, 1935, 1731, 1385, 1279, 9, 1278, 1, 420185, 0, 1146359, 746359, 968896, 868703, 1393640, 745019, 354147, 159462, 483979, 169092, 75937, 385858, 322166, 466635, 571268, 447132, 13792, 446484, 736844, 732675, 170232, 112983, 63184, 142357, 173945, 21521, 250, 0, 250, 4140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 39, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 15, 0, 0, 0, 10, 0, 5, 0, 0, 0, 0, 0, 0, 283, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
//Images:
@ -26,16 +38,40 @@ func main() {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459, 0, 0, 7, 0, 0, 0, 7, 1, 1, 0, 2, 1, 506, 706, 0, 3903, 3552, 3694, 3338, 3262, 3257, 3222, 3249, 3325, 3261, 5, 0, 0, 1, 0, 0, 0, 48, 31, 61, 53, 46, 17, 17, 34, 32, 9, 22, 17, 31, 27, 19, 52, 5, 46, 84, 38, 14, 5, 19, 2, 2, 0, 8, 0, 8, 0, 180, 0, 5847, 3282, 3729, 3695, 3842, 3356, 316, 139, 487, 117, 95, 476, 289, 428, 609, 467, 5, 446, 592, 955, 130, 112, 57, 390, 168, 14, 0, 2, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}
// Override with equally distributed characters
if false {
histogram = [256]uint64{}
var chars string
// base c64
chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
// hex
//chars = "0123456789abcdef"
for _, v := range []byte(chars) {
histogram[v] = 1
if *indexFile != "" {
for i := range histogram[:] {
histogram[i] = 0
}
b, err := ioutil.ReadFile(*indexFile)
if err != nil {
panic(err)
}
if *scsuEncode {
br := bufio.NewReader(bytes.NewBuffer(b))
var encoded []byte
for {
line, err := br.ReadString('\n')
if err != nil {
break
}
line = strings.TrimSpace(line)
if len(line) < 3 || !utf8.ValidString(line) {
continue
}
e, err := scsu.Encode(line, nil)
if err != nil {
panic(err)
}
if len(e) >= len([]byte(line)) {
continue
}
encoded = append(encoded, e...)
}
fmt.Println("scsu", len(b), "->", len(encoded), "(excluding bigger)")
b = encoded
}
for _, v := range b {
histogram[v]++
}
}
@ -49,7 +85,7 @@ func main() {
const scale = 100 << 10
var tmp []byte
for i, v := range histogram[:] {
if v == 0 && omitUnused {
if v == 0 && !*addUnused {
continue
}
nf := float64(v) / float64(total) * scale

View file

@ -26,9 +26,12 @@ var (
const (
tableUncompressed = 0
tableRLE = 61
tableCustom = 62
tableReserved = 63
tableSCSU = 59
tableSCSUPlain = 60
tableRLE = 61
tableCustom = 62
tableReserved = 63
)
// predefined tables as base64 URL encoded string.
@ -47,6 +50,10 @@ var tablesData = [64]string{
5: "JRDIcQf_______8PgIiIiIgINkggARHlkQwSSCCBxHFYINHdfXI=",
// Special tables:
// SCSU and a fairly generic table:
tableSCSU: "UxAgZmEB-RYPU8hrnAk6uMgpTNQMB5MGRBx0D3T0JjyUyY-yOi5CoGgktbAktSh7d36HtPTFu7SXJ7FYw_AYmA74ZH2vWgc8O6Z5jLnWnsFqU_4B",
// SCSU with no table...
tableSCSUPlain: "",
// Compressed data has its own table.
tableCustom: "",
// Reserved for extension.