diff --git a/go.mod b/go.mod index 4f3e6cd64..13c94287a 100644 --- a/go.mod +++ b/go.mod @@ -21,6 +21,7 @@ require ( github.com/calebcase/tmpfile v1.0.2 // indirect github.com/colinmarc/hdfs/v2 v2.2.0 github.com/coreos/go-semver v0.3.0 + github.com/dop251/scsu v0.0.0-20200422003335-8fadfb689669 github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible github.com/gabriel-vasile/mimetype v1.1.2 github.com/gogo/protobuf v1.3.2 // indirect diff --git a/go.sum b/go.sum index d23f53196..135d8930f 100644 --- a/go.sum +++ b/go.sum @@ -171,11 +171,14 @@ github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumC github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= +github.com/dop251/scsu v0.0.0-20200422003335-8fadfb689669 h1:e28M2/odOZjMc1J2ZZwgex6NM9+aqr1nMlTqPLayxbk= +github.com/dop251/scsu v0.0.0-20200422003335-8fadfb689669/go.mod h1:Gth7Xev0h28tuTayG4HlTZy90IXhiDgV2+MLtJzjpP0= github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible h1:DtumzkLk2zZ2SeElEr+VNz+zV7l+BTe509cV4sKPXbM= github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible/go.mod h1:lr+LhMM3F6Y3lW1T9j2U5l7QeuWm87N9+PPXo3yH4qY= github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/dustin/go-humanize v0.0.0-20180421182945-02af3965c54e/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813 h1:NgO45/5mBLRVfiXerEFzH6ikcZ7DNRPS639xFg3ENzU= github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw= github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs= github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU= diff --git a/lib/encoder/filename/decode.go b/lib/encoder/filename/decode.go index 75f87ccd5..6e859fd31 100644 --- a/lib/encoder/filename/decode.go +++ b/lib/encoder/filename/decode.go @@ -7,6 +7,7 @@ import ( "errors" "sync" + "github.com/dop251/scsu" "github.com/klauspost/compress/huff0" ) @@ -22,6 +23,7 @@ var customDecMu sync.Mutex // Decode an encoded string. func Decode(s string) (string, error) { + initCoders() if len(s) < 1 { return "", ErrCorrupted } @@ -31,19 +33,25 @@ func Decode(s string) (string, error) { } table-- s = s[1:] - data := make([]byte, base64.URLEncoding.DecodedLen(len(s))) n, err := base64.URLEncoding.Decode(data, ([]byte)(s)) if err != nil || n < 0 { return "", ErrCorrupted } data = data[:n] + return DecodeBytes(table, data) +} +// DecodeBytes will decode raw id and data values. +func DecodeBytes(table byte, data []byte) (string, error) { + initCoders() switch table { case tableUncompressed: return string(data), nil case tableReserved: return "", ErrUnsupported + case tableSCSUPlain: + return scsu.Decode(data) case tableRLE: if len(data) < 2 { return "", ErrCorrupted @@ -79,6 +87,9 @@ func Decode(s string) (string, error) { if err != nil { return "", ErrCorrupted } + if table == tableSCSU { + return scsu.Decode(name) + } return string(name), nil } } diff --git a/lib/encoder/filename/decode_test.go b/lib/encoder/filename/decode_test.go new file mode 100644 index 000000000..6d377ac09 --- /dev/null +++ b/lib/encoder/filename/decode_test.go @@ -0,0 +1,92 @@ +package filename + +import "testing" + +func TestDecode(t *testing.T) { + tests := []struct { + name string + encoded string + want string + wantErr bool + }{ + { + name: "unicode-1", + encoded: "8D5V3MESVd-WEF7WuqaOvpKUWtYGEyw5UDQ==", + want: "長い長いUNICODEファイル名", + wantErr: false, + }, + { + name: "unicode-2", + encoded: "8GyHV1N7u2OEg4ufQ3eHQ3Ngg6N3X0CDg4-HX0NXU2tg=", + want: "ვეპხის ტყაოსანი შოთა რუსთაველი", + wantErr: false, + }, + { + name: "unicode-3", + encoded: "7LpehMXOrWe7mcT_lpf2MN1Nmgu55jpXHLavZcXJb2UTJ-UmGU15iznkD", + want: "Sønderjysk: Æ ka æe glass uhen at det go mæ naue.,", + wantErr: false, + }, + { + name: "unicode-4", + encoded: "7TCSRm0liJDR0ulpBq4Lla_XB2mWdLFMEs8wEQKHAGa8FRr333ntJ6Ww6_f__N5VKeYM=", + want: "Hello------world 時危兵甲滿天涯,載道流離起怨咨.bin", + }, + { + name: "plain-1", + encoded: "BzGQYxqHBA6ljTsir80gUM5Y=", + want: "-Duplican99E8ZI4___9_", + wantErr: false, + }, + { + name: "hex-1", + encoded: "D_--tHZROQpqqJ9PafqNa6STF", + want: "13646871dfabbs43323564654bbefff", + wantErr: false, + }, + { + name: "base64-1", + encoded: "FMpABB9Ef0KP8OrVxjnE3LzUePuLZi8pPg7eW8bgyW2d3Ucckf4rlE0mkAvlILVpOmF3L-rFbmNrpUO2HQFlF4SCMPVPeCEX6LeOg5JVpUVCXV1WSazD9vSpr", + want: "UxAYiB0FNTTkXRw9P8hwq-WmN7tYwbe-sFw8C3snDRG1d-yjrdOUVZQyLdtkJ8tuvhBSnuBiLjVieCAroWEZDIO4Hb_rKgdzPjMqFE7inwHJ2isF==", + wantErr: false, + }, + { + name: "custom-1", + encoded: "-BeADJCoG_________________xc=", + want: "Uaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + wantErr: false, + }, + { + name: "rle-1", + encoded: "9a2E=", + want: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + wantErr: false, + }, + { + name: "regular-1", + encoded: "BeSSrnzj0j3OXyR9K81M=", + want: "regular-filename.txt", + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := Decode(tt.encoded) + if (err != nil) != tt.wantErr { + if tt.encoded == "" && tt.want != "" { + proposed := Encode(tt.want) + table := decodeMap[proposed[0]] - 1 + t.Errorf("No encoded value, try '%s', table is %d", proposed, table) + return + } + t.Errorf("Decode() error = %v, wantErr %v", err, tt.wantErr) + return + } + + if got != tt.want { + t.Errorf("Decode() got = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/lib/encoder/filename/encode.go b/lib/encoder/filename/encode.go index 555bf6b91..031dbd07a 100644 --- a/lib/encoder/filename/encode.go +++ b/lib/encoder/filename/encode.go @@ -4,6 +4,7 @@ import ( "encoding/base64" "encoding/binary" + "github.com/dop251/scsu" "github.com/klauspost/compress/huff0" ) @@ -11,21 +12,45 @@ import ( // Calling Decode with the returned string should always succeed. // It is not a requirement that the input string is valid utf-8. func Encode(s string) string { + table, payload := EncodeBytes(s) + return string(encodeURL[table]) + base64.URLEncoding.EncodeToString(payload) +} + +// EncodeBytes will compress the given string and return a table identifier and a payload. +func EncodeBytes(s string) (table byte, payload []byte) { initCoders() bestSize := len(s) - bestTable := tableUncompressed + bestTable := byte(tableUncompressed) org := []byte(s) bestOut := []byte(s) - // Try all tables and choose the best for i, enc := range encTables[:] { + org := org if len(org) <= 1 || len(org) > maxLength { // Use the uncompressed break } + if enc == nil { continue } + + if i == tableSCSU { + var err error + olen := len(org) + org, err = scsu.EncodeStrict(s, make([]byte, 0, len(org))) + if err != nil || olen <= len(org) { + continue + } + if len(org) < bestSize { + // This is already better, store so we can use if the table cannot. + bestOut = bestOut[:len(org)] + bestTable = tableSCSUPlain + bestSize = len(org) + copy(bestOut, org) + } + } + // Try to encode using table. err := func() error { encTableLocks[i].Lock() @@ -36,14 +61,14 @@ func Encode(s string) string { } if len(out) < bestSize { bestOut = bestOut[:len(out)] - bestTable = i + bestTable = byte(i) bestSize = len(out) copy(bestOut, out) } return nil }() // If input is a single byte repeated store as RLE or save uncompressed. - if err == huff0.ErrUseRLE { + if err == huff0.ErrUseRLE && i != tableSCSU { if len(org) > 2 { // Encode as one byte repeated since it will be smaller than uncompressed. n := binary.PutUvarint(bestOut, uint64(len(org))) @@ -56,5 +81,5 @@ func Encode(s string) string { } } - return string(encodeURL[bestTable]) + base64.URLEncoding.EncodeToString(bestOut) + return bestTable, bestOut } diff --git a/lib/encoder/filename/fuzz.go b/lib/encoder/filename/fuzz.go index 73468ee71..5983a407f 100644 --- a/lib/encoder/filename/fuzz.go +++ b/lib/encoder/filename/fuzz.go @@ -25,7 +25,9 @@ func Fuzz(data []byte) int { panic(fmt.Sprintf("error decoding %q, input %q: %v", enc, string(data), err)) } if !bytes.Equal(data, []byte(decoded)) { - panic(fmt.Sprintf("decode mismatch, encoded: %q, org: %q, got: %q", enc, string(data), decoded)) + table := decodeMap[enc[0]] + table-- + panic(fmt.Sprintf("decode mismatch, encoded: %q, org: %q, got: %q, table %d", enc, string(data), decoded, int(table))) } // Everything is good. diff --git a/lib/encoder/filename/gentable.go b/lib/encoder/filename/gentable.go index 4acb61707..5475c60ab 100644 --- a/lib/encoder/filename/gentable.go +++ b/lib/encoder/filename/gentable.go @@ -3,21 +3,33 @@ package main import ( + "bufio" + "bytes" "encoding/base64" + "flag" "fmt" + "io/ioutil" "math" + "strings" + "unicode/utf8" + "github.com/dop251/scsu" "github.com/klauspost/compress" "github.com/klauspost/compress/huff0" ) -// Replace/add histogram data and execute go run gentable.go +// execute go run gentable.go +var indexFile = flag.String("index", "", "Index this file for table") + +// Allow non-represented characters. +var addUnused = flag.Bool("all", true, "Make all bytes possible") +var scsuEncode = flag.Bool("scsu", false, "SCSU encode on each line before table") func main() { - // Allow non-represented characters. - const omitUnused = false + flag.Parse() histogram := [256]uint64{ + // Replace/add histogram data and execute go run gentable.go // ncw home directory //0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19442, 760, 0, 349, 570, 1520, 199, 76, 685, 654, 0, 40377, 1605, 395132, 935270, 0, 1156377, 887730, 811737, 712241, 693240, 689139, 675964, 656417, 666577, 657413, 532, 24, 0, 145, 0, 3, 946, 44932, 37362, 46126, 36752, 76346, 19338, 47457, 14288, 38163, 4350, 7867, 36541, 65011, 30255, 26792, 22097, 1803, 39191, 61965, 76585, 11887, 12896, 5931, 1935, 1731, 1385, 1279, 9, 1278, 1, 420185, 0, 1146359, 746359, 968896, 868703, 1393640, 745019, 354147, 159462, 483979, 169092, 75937, 385858, 322166, 466635, 571268, 447132, 13792, 446484, 736844, 732675, 170232, 112983, 63184, 142357, 173945, 21521, 250, 0, 250, 4140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 39, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 15, 0, 0, 0, 10, 0, 5, 0, 0, 0, 0, 0, 0, 283, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //Images: @@ -26,16 +38,40 @@ func main() { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459, 0, 0, 7, 0, 0, 0, 7, 1, 1, 0, 2, 1, 506, 706, 0, 3903, 3552, 3694, 3338, 3262, 3257, 3222, 3249, 3325, 3261, 5, 0, 0, 1, 0, 0, 0, 48, 31, 61, 53, 46, 17, 17, 34, 32, 9, 22, 17, 31, 27, 19, 52, 5, 46, 84, 38, 14, 5, 19, 2, 2, 0, 8, 0, 8, 0, 180, 0, 5847, 3282, 3729, 3695, 3842, 3356, 316, 139, 487, 117, 95, 476, 289, 428, 609, 467, 5, 446, 592, 955, 130, 112, 57, 390, 168, 14, 0, 2, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } - // Override with equally distributed characters - if false { - histogram = [256]uint64{} - var chars string - // base c64 - chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" - // hex - //chars = "0123456789abcdef" - for _, v := range []byte(chars) { - histogram[v] = 1 + if *indexFile != "" { + for i := range histogram[:] { + histogram[i] = 0 + } + b, err := ioutil.ReadFile(*indexFile) + if err != nil { + panic(err) + } + if *scsuEncode { + br := bufio.NewReader(bytes.NewBuffer(b)) + var encoded []byte + for { + line, err := br.ReadString('\n') + if err != nil { + break + } + line = strings.TrimSpace(line) + if len(line) < 3 || !utf8.ValidString(line) { + continue + } + e, err := scsu.Encode(line, nil) + if err != nil { + panic(err) + } + if len(e) >= len([]byte(line)) { + continue + } + encoded = append(encoded, e...) + } + fmt.Println("scsu", len(b), "->", len(encoded), "(excluding bigger)") + b = encoded + } + for _, v := range b { + histogram[v]++ } } @@ -49,7 +85,7 @@ func main() { const scale = 100 << 10 var tmp []byte for i, v := range histogram[:] { - if v == 0 && omitUnused { + if v == 0 && !*addUnused { continue } nf := float64(v) / float64(total) * scale diff --git a/lib/encoder/filename/init.go b/lib/encoder/filename/init.go index 1a7ae41ac..6f6bad147 100644 --- a/lib/encoder/filename/init.go +++ b/lib/encoder/filename/init.go @@ -26,9 +26,12 @@ var ( const ( tableUncompressed = 0 - tableRLE = 61 - tableCustom = 62 - tableReserved = 63 + + tableSCSU = 59 + tableSCSUPlain = 60 + tableRLE = 61 + tableCustom = 62 + tableReserved = 63 ) // predefined tables as base64 URL encoded string. @@ -47,6 +50,10 @@ var tablesData = [64]string{ 5: "JRDIcQf_______8PgIiIiIgINkggARHlkQwSSCCBxHFYINHdfXI=", // Special tables: + // SCSU and a fairly generic table: + tableSCSU: "UxAgZmEB-RYPU8hrnAk6uMgpTNQMB5MGRBx0D3T0JjyUyY-yOi5CoGgktbAktSh7d36HtPTFu7SXJ7FYw_AYmA74ZH2vWgc8O6Z5jLnWnsFqU_4B", + // SCSU with no table... + tableSCSUPlain: "", // Compressed data has its own table. tableCustom: "", // Reserved for extension.