encoder/filename: Add SCSU as tables
Instead of only adding SCSU, add it as one of the existing tables. Allow direct SCSU and add a, perhaps, reasonable table as well. Also add byte interfaces, `EncodeBytes` and `DecodeBytes`, that don't base64 encode the output. Fuzz tested, and decode tests added.
This commit is contained in:
parent
47b69d6300
commit
424aaac2e1
8 changed files with 201 additions and 24 deletions
1
go.mod
1
go.mod
|
@ -21,6 +21,7 @@ require (
|
|||
github.com/calebcase/tmpfile v1.0.2 // indirect
|
||||
github.com/colinmarc/hdfs/v2 v2.2.0
|
||||
github.com/coreos/go-semver v0.3.0
|
||||
github.com/dop251/scsu v0.0.0-20200422003335-8fadfb689669
|
||||
github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible
|
||||
github.com/gabriel-vasile/mimetype v1.1.2
|
||||
github.com/gogo/protobuf v1.3.2 // indirect
|
||||
|
|
3
go.sum
3
go.sum
|
@ -171,11 +171,14 @@ github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumC
|
|||
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
|
||||
github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
|
||||
github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE=
|
||||
github.com/dop251/scsu v0.0.0-20200422003335-8fadfb689669 h1:e28M2/odOZjMc1J2ZZwgex6NM9+aqr1nMlTqPLayxbk=
|
||||
github.com/dop251/scsu v0.0.0-20200422003335-8fadfb689669/go.mod h1:Gth7Xev0h28tuTayG4HlTZy90IXhiDgV2+MLtJzjpP0=
|
||||
github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible h1:DtumzkLk2zZ2SeElEr+VNz+zV7l+BTe509cV4sKPXbM=
|
||||
github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible/go.mod h1:lr+LhMM3F6Y3lW1T9j2U5l7QeuWm87N9+PPXo3yH4qY=
|
||||
github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
|
||||
github.com/dustin/go-humanize v0.0.0-20180421182945-02af3965c54e/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
|
||||
github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
|
||||
github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813 h1:NgO45/5mBLRVfiXerEFzH6ikcZ7DNRPS639xFg3ENzU=
|
||||
github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw=
|
||||
github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs=
|
||||
github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU=
|
||||
|
|
|
@ -7,6 +7,7 @@ import (
|
|||
"errors"
|
||||
"sync"
|
||||
|
||||
"github.com/dop251/scsu"
|
||||
"github.com/klauspost/compress/huff0"
|
||||
)
|
||||
|
||||
|
@ -22,6 +23,7 @@ var customDecMu sync.Mutex
|
|||
|
||||
// Decode an encoded string.
|
||||
func Decode(s string) (string, error) {
|
||||
initCoders()
|
||||
if len(s) < 1 {
|
||||
return "", ErrCorrupted
|
||||
}
|
||||
|
@ -31,19 +33,25 @@ func Decode(s string) (string, error) {
|
|||
}
|
||||
table--
|
||||
s = s[1:]
|
||||
|
||||
data := make([]byte, base64.URLEncoding.DecodedLen(len(s)))
|
||||
n, err := base64.URLEncoding.Decode(data, ([]byte)(s))
|
||||
if err != nil || n < 0 {
|
||||
return "", ErrCorrupted
|
||||
}
|
||||
data = data[:n]
|
||||
return DecodeBytes(table, data)
|
||||
}
|
||||
|
||||
// DecodeBytes will decode raw id and data values.
|
||||
func DecodeBytes(table byte, data []byte) (string, error) {
|
||||
initCoders()
|
||||
switch table {
|
||||
case tableUncompressed:
|
||||
return string(data), nil
|
||||
case tableReserved:
|
||||
return "", ErrUnsupported
|
||||
case tableSCSUPlain:
|
||||
return scsu.Decode(data)
|
||||
case tableRLE:
|
||||
if len(data) < 2 {
|
||||
return "", ErrCorrupted
|
||||
|
@ -79,6 +87,9 @@ func Decode(s string) (string, error) {
|
|||
if err != nil {
|
||||
return "", ErrCorrupted
|
||||
}
|
||||
if table == tableSCSU {
|
||||
return scsu.Decode(name)
|
||||
}
|
||||
return string(name), nil
|
||||
}
|
||||
}
|
||||
|
|
92
lib/encoder/filename/decode_test.go
Normal file
92
lib/encoder/filename/decode_test.go
Normal file
|
@ -0,0 +1,92 @@
|
|||
package filename
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestDecode(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
encoded string
|
||||
want string
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "unicode-1",
|
||||
encoded: "8D5V3MESVd-WEF7WuqaOvpKUWtYGEyw5UDQ==",
|
||||
want: "長い長いUNICODEファイル名",
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "unicode-2",
|
||||
encoded: "8GyHV1N7u2OEg4ufQ3eHQ3Ngg6N3X0CDg4-HX0NXU2tg=",
|
||||
want: "ვეპხის ტყაოსანი შოთა რუსთაველი",
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "unicode-3",
|
||||
encoded: "7LpehMXOrWe7mcT_lpf2MN1Nmgu55jpXHLavZcXJb2UTJ-UmGU15iznkD",
|
||||
want: "Sønderjysk: Æ ka æe glass uhen at det go mæ naue.,",
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "unicode-4",
|
||||
encoded: "7TCSRm0liJDR0ulpBq4Lla_XB2mWdLFMEs8wEQKHAGa8FRr333ntJ6Ww6_f__N5VKeYM=",
|
||||
want: "Hello------world 時危兵甲滿天涯,載道流離起怨咨.bin",
|
||||
},
|
||||
{
|
||||
name: "plain-1",
|
||||
encoded: "BzGQYxqHBA6ljTsir80gUM5Y=",
|
||||
want: "-Duplican99E8ZI4___9_",
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "hex-1",
|
||||
encoded: "D_--tHZROQpqqJ9PafqNa6STF",
|
||||
want: "13646871dfabbs43323564654bbefff",
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "base64-1",
|
||||
encoded: "FMpABB9Ef0KP8OrVxjnE3LzUePuLZi8pPg7eW8bgyW2d3Ucckf4rlE0mkAvlILVpOmF3L-rFbmNrpUO2HQFlF4SCMPVPeCEX6LeOg5JVpUVCXV1WSazD9vSpr",
|
||||
want: "UxAYiB0FNTTkXRw9P8hwq-WmN7tYwbe-sFw8C3snDRG1d-yjrdOUVZQyLdtkJ8tuvhBSnuBiLjVieCAroWEZDIO4Hb_rKgdzPjMqFE7inwHJ2isF==",
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "custom-1",
|
||||
encoded: "-BeADJCoG_________________xc=",
|
||||
want: "Uaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "rle-1",
|
||||
encoded: "9a2E=",
|
||||
want: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "regular-1",
|
||||
encoded: "BeSSrnzj0j3OXyR9K81M=",
|
||||
want: "regular-filename.txt",
|
||||
wantErr: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got, err := Decode(tt.encoded)
|
||||
if (err != nil) != tt.wantErr {
|
||||
if tt.encoded == "" && tt.want != "" {
|
||||
proposed := Encode(tt.want)
|
||||
table := decodeMap[proposed[0]] - 1
|
||||
t.Errorf("No encoded value, try '%s', table is %d", proposed, table)
|
||||
return
|
||||
}
|
||||
t.Errorf("Decode() error = %v, wantErr %v", err, tt.wantErr)
|
||||
return
|
||||
}
|
||||
|
||||
if got != tt.want {
|
||||
t.Errorf("Decode() got = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
|
@ -4,6 +4,7 @@ import (
|
|||
"encoding/base64"
|
||||
"encoding/binary"
|
||||
|
||||
"github.com/dop251/scsu"
|
||||
"github.com/klauspost/compress/huff0"
|
||||
)
|
||||
|
||||
|
@ -11,21 +12,45 @@ import (
|
|||
// Calling Decode with the returned string should always succeed.
|
||||
// It is not a requirement that the input string is valid utf-8.
|
||||
func Encode(s string) string {
|
||||
table, payload := EncodeBytes(s)
|
||||
return string(encodeURL[table]) + base64.URLEncoding.EncodeToString(payload)
|
||||
}
|
||||
|
||||
// EncodeBytes will compress the given string and return a table identifier and a payload.
|
||||
func EncodeBytes(s string) (table byte, payload []byte) {
|
||||
initCoders()
|
||||
bestSize := len(s)
|
||||
bestTable := tableUncompressed
|
||||
bestTable := byte(tableUncompressed)
|
||||
org := []byte(s)
|
||||
bestOut := []byte(s)
|
||||
|
||||
// Try all tables and choose the best
|
||||
for i, enc := range encTables[:] {
|
||||
org := org
|
||||
if len(org) <= 1 || len(org) > maxLength {
|
||||
// Use the uncompressed
|
||||
break
|
||||
}
|
||||
|
||||
if enc == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if i == tableSCSU {
|
||||
var err error
|
||||
olen := len(org)
|
||||
org, err = scsu.EncodeStrict(s, make([]byte, 0, len(org)))
|
||||
if err != nil || olen <= len(org) {
|
||||
continue
|
||||
}
|
||||
if len(org) < bestSize {
|
||||
// This is already better; store it so we can use it if the table cannot do better.
|
||||
bestOut = bestOut[:len(org)]
|
||||
bestTable = tableSCSUPlain
|
||||
bestSize = len(org)
|
||||
copy(bestOut, org)
|
||||
}
|
||||
}
|
||||
|
||||
// Try to encode using table.
|
||||
err := func() error {
|
||||
encTableLocks[i].Lock()
|
||||
|
@ -36,14 +61,14 @@ func Encode(s string) string {
|
|||
}
|
||||
if len(out) < bestSize {
|
||||
bestOut = bestOut[:len(out)]
|
||||
bestTable = i
|
||||
bestTable = byte(i)
|
||||
bestSize = len(out)
|
||||
copy(bestOut, out)
|
||||
}
|
||||
return nil
|
||||
}()
|
||||
// If input is a single byte repeated store as RLE or save uncompressed.
|
||||
if err == huff0.ErrUseRLE {
|
||||
if err == huff0.ErrUseRLE && i != tableSCSU {
|
||||
if len(org) > 2 {
|
||||
// Encode as one byte repeated since it will be smaller than uncompressed.
|
||||
n := binary.PutUvarint(bestOut, uint64(len(org)))
|
||||
|
@ -56,5 +81,5 @@ func Encode(s string) string {
|
|||
}
|
||||
}
|
||||
|
||||
return string(encodeURL[bestTable]) + base64.URLEncoding.EncodeToString(bestOut)
|
||||
return bestTable, bestOut
|
||||
}
|
||||
|
|
|
@ -25,7 +25,9 @@ func Fuzz(data []byte) int {
|
|||
panic(fmt.Sprintf("error decoding %q, input %q: %v", enc, string(data), err))
|
||||
}
|
||||
if !bytes.Equal(data, []byte(decoded)) {
|
||||
panic(fmt.Sprintf("decode mismatch, encoded: %q, org: %q, got: %q", enc, string(data), decoded))
|
||||
table := decodeMap[enc[0]]
|
||||
table--
|
||||
panic(fmt.Sprintf("decode mismatch, encoded: %q, org: %q, got: %q, table %d", enc, string(data), decoded, int(table)))
|
||||
}
|
||||
|
||||
// Everything is good.
|
||||
|
|
|
@ -3,21 +3,33 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/base64"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"math"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/dop251/scsu"
|
||||
"github.com/klauspost/compress"
|
||||
"github.com/klauspost/compress/huff0"
|
||||
)
|
||||
|
||||
// Replace/add histogram data and execute go run gentable.go
|
||||
// execute go run gentable.go
|
||||
var indexFile = flag.String("index", "", "Index this file for table")
|
||||
|
||||
// Allow non-represented characters.
|
||||
var addUnused = flag.Bool("all", true, "Make all bytes possible")
|
||||
var scsuEncode = flag.Bool("scsu", false, "SCSU encode on each line before table")
|
||||
|
||||
func main() {
|
||||
// Allow non-represented characters.
|
||||
const omitUnused = false
|
||||
flag.Parse()
|
||||
|
||||
histogram := [256]uint64{
|
||||
// Replace/add histogram data and execute go run gentable.go
|
||||
// ncw home directory
|
||||
//0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19442, 760, 0, 349, 570, 1520, 199, 76, 685, 654, 0, 40377, 1605, 395132, 935270, 0, 1156377, 887730, 811737, 712241, 693240, 689139, 675964, 656417, 666577, 657413, 532, 24, 0, 145, 0, 3, 946, 44932, 37362, 46126, 36752, 76346, 19338, 47457, 14288, 38163, 4350, 7867, 36541, 65011, 30255, 26792, 22097, 1803, 39191, 61965, 76585, 11887, 12896, 5931, 1935, 1731, 1385, 1279, 9, 1278, 1, 420185, 0, 1146359, 746359, 968896, 868703, 1393640, 745019, 354147, 159462, 483979, 169092, 75937, 385858, 322166, 466635, 571268, 447132, 13792, 446484, 736844, 732675, 170232, 112983, 63184, 142357, 173945, 21521, 250, 0, 250, 4140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 39, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 15, 0, 0, 0, 10, 0, 5, 0, 0, 0, 0, 0, 0, 283, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
//Images:
|
||||
|
@ -26,16 +38,40 @@ func main() {
|
|||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459, 0, 0, 7, 0, 0, 0, 7, 1, 1, 0, 2, 1, 506, 706, 0, 3903, 3552, 3694, 3338, 3262, 3257, 3222, 3249, 3325, 3261, 5, 0, 0, 1, 0, 0, 0, 48, 31, 61, 53, 46, 17, 17, 34, 32, 9, 22, 17, 31, 27, 19, 52, 5, 46, 84, 38, 14, 5, 19, 2, 2, 0, 8, 0, 8, 0, 180, 0, 5847, 3282, 3729, 3695, 3842, 3356, 316, 139, 487, 117, 95, 476, 289, 428, 609, 467, 5, 446, 592, 955, 130, 112, 57, 390, 168, 14, 0, 2, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
}
|
||||
|
||||
// Override with equally distributed characters
|
||||
if false {
|
||||
histogram = [256]uint64{}
|
||||
var chars string
|
||||
// base c64
|
||||
chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
|
||||
// hex
|
||||
//chars = "0123456789abcdef"
|
||||
for _, v := range []byte(chars) {
|
||||
histogram[v] = 1
|
||||
if *indexFile != "" {
|
||||
for i := range histogram[:] {
|
||||
histogram[i] = 0
|
||||
}
|
||||
b, err := ioutil.ReadFile(*indexFile)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if *scsuEncode {
|
||||
br := bufio.NewReader(bytes.NewBuffer(b))
|
||||
var encoded []byte
|
||||
for {
|
||||
line, err := br.ReadString('\n')
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
line = strings.TrimSpace(line)
|
||||
if len(line) < 3 || !utf8.ValidString(line) {
|
||||
continue
|
||||
}
|
||||
e, err := scsu.Encode(line, nil)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if len(e) >= len([]byte(line)) {
|
||||
continue
|
||||
}
|
||||
encoded = append(encoded, e...)
|
||||
}
|
||||
fmt.Println("scsu", len(b), "->", len(encoded), "(excluding bigger)")
|
||||
b = encoded
|
||||
}
|
||||
for _, v := range b {
|
||||
histogram[v]++
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -49,7 +85,7 @@ func main() {
|
|||
const scale = 100 << 10
|
||||
var tmp []byte
|
||||
for i, v := range histogram[:] {
|
||||
if v == 0 && omitUnused {
|
||||
if v == 0 && !*addUnused {
|
||||
continue
|
||||
}
|
||||
nf := float64(v) / float64(total) * scale
|
||||
|
|
|
@ -26,9 +26,12 @@ var (
|
|||
|
||||
const (
|
||||
tableUncompressed = 0
|
||||
tableRLE = 61
|
||||
tableCustom = 62
|
||||
tableReserved = 63
|
||||
|
||||
tableSCSU = 59
|
||||
tableSCSUPlain = 60
|
||||
tableRLE = 61
|
||||
tableCustom = 62
|
||||
tableReserved = 63
|
||||
)
|
||||
|
||||
// predefined tables as base64 URL encoded string.
|
||||
|
@ -47,6 +50,10 @@ var tablesData = [64]string{
|
|||
5: "JRDIcQf_______8PgIiIiIgINkggARHlkQwSSCCBxHFYINHdfXI=",
|
||||
|
||||
// Special tables:
|
||||
// SCSU and a fairly generic table:
|
||||
tableSCSU: "UxAgZmEB-RYPU8hrnAk6uMgpTNQMB5MGRBx0D3T0JjyUyY-yOi5CoGgktbAktSh7d36HtPTFu7SXJ7FYw_AYmA74ZH2vWgc8O6Z5jLnWnsFqU_4B",
|
||||
// SCSU with no table...
|
||||
tableSCSUPlain: "",
|
||||
// Compressed data has its own table.
|
||||
tableCustom: "",
|
||||
// Reserved for extension.
|
||||
|
|
Loading…
Reference in a new issue