forked from TrueCloudLab/rclone
513 lines
14 KiB
Go
513 lines
14 KiB
Go
// Copyright 2016 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// +build ignore
|
|
|
|
package main
|
|
|
|
// This file generates data for the CLDR plural rules, as defined in
|
|
// http://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules
|
|
//
|
|
// We assume a slightly simplified grammar:
|
|
//
|
|
// condition = and_condition ('or' and_condition)* samples
|
|
// and_condition = relation ('and' relation)*
|
|
// relation = expr ('=' | '!=') range_list
|
|
// expr = operand ('%' '10' '0'* )?
|
|
// operand = 'n' | 'i' | 'f' | 't' | 'v' | 'w'
|
|
// range_list = (range | value) (',' range_list)*
|
|
// range = value'..'value
|
|
// value = digit+
|
|
// digit = 0|1|2|3|4|5|6|7|8|9
|
|
//
|
|
// samples = ('@integer' sampleList)?
|
|
// ('@decimal' sampleList)?
|
|
// sampleList = sampleRange (',' sampleRange)* (',' ('…'|'...'))?
|
|
// sampleRange = decimalValue ('~' decimalValue)?
|
|
// decimalValue = value ('.' value)?
|
|
//
|
|
// Symbol Value
|
|
// n absolute value of the source number (integer and decimals).
|
|
// i integer digits of n.
|
|
// v number of visible fraction digits in n, with trailing zeros.
|
|
// w number of visible fraction digits in n, without trailing zeros.
|
|
// f visible fractional digits in n, with trailing zeros.
|
|
// t visible fractional digits in n, without trailing zeros.
|
|
//
|
|
// The algorithm for which the data is generated is based on the following
|
|
// observations
|
|
//
|
|
// - the number of different sets of numbers which the plural rules use to
|
|
// test inclusion is limited,
|
|
// - most numbers that are tested on are < 100
|
|
//
|
|
// This allows us to define a bitmap for each number < 100 where a bit i
|
|
// indicates whether this number is included in some defined set i.
|
|
// The function matchPlural in plural.go defines how we can subsequently use
|
|
// this data to determine inclusion.
|
|
//
|
|
// There are a few languages for which this doesn't work. For one Italian and
|
|
// Azerbaijan, which both test against numbers > 100 for ordinals and Breton,
|
|
// which considers whether numbers are multiples of hundreds. The model here
|
|
// could be extended to handle Italian and Azerbaijan fairly easily (by
|
|
// considering the numbers 100, 200, 300, ..., 800, 900 in addition to the first
|
|
// 100), but for now it seems easier to just hard-code these cases.
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"golang.org/x/text/internal"
|
|
"golang.org/x/text/internal/gen"
|
|
"golang.org/x/text/language"
|
|
"golang.org/x/text/unicode/cldr"
|
|
)
|
|
|
|
var (
|
|
test = flag.Bool("test", false,
|
|
"test existing tables; can be used to compare web data with package data.")
|
|
outputFile = flag.String("output", "tables.go", "output file")
|
|
outputTestFile = flag.String("testoutput", "data_test.go", "output file")
|
|
|
|
draft = flag.String("draft",
|
|
"contributed",
|
|
`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
|
|
)
|
|
|
|
func main() {
|
|
gen.Init()
|
|
|
|
const pkg = "plural"
|
|
|
|
gen.Repackage("gen_common.go", "common.go", pkg)
|
|
// Read the CLDR zip file.
|
|
r := gen.OpenCLDRCoreZip()
|
|
defer r.Close()
|
|
|
|
d := &cldr.Decoder{}
|
|
d.SetDirFilter("supplemental", "main")
|
|
d.SetSectionFilter("numbers", "plurals")
|
|
data, err := d.DecodeZip(r)
|
|
if err != nil {
|
|
log.Fatalf("DecodeZip: %v", err)
|
|
}
|
|
|
|
w := gen.NewCodeWriter()
|
|
defer w.WriteGoFile(*outputFile, pkg)
|
|
|
|
gen.WriteCLDRVersion(w)
|
|
|
|
genPlurals(w, data)
|
|
|
|
w = gen.NewCodeWriter()
|
|
defer w.WriteGoFile(*outputTestFile, pkg)
|
|
|
|
genPluralsTests(w, data)
|
|
}
|
|
|
|
type pluralTest struct {
|
|
locales string // space-separated list of locales for this test
|
|
form int // Use int instead of Form to simplify generation.
|
|
integer []string // Entries of the form \d+ or \d+~\d+
|
|
decimal []string // Entries of the form \f+ or \f+ +~\f+, where f is \d+\.\d+
|
|
}
|
|
|
|
func genPluralsTests(w *gen.CodeWriter, data *cldr.CLDR) {
|
|
w.WriteType(pluralTest{})
|
|
|
|
for _, plurals := range data.Supplemental().Plurals {
|
|
if plurals.Type == "" {
|
|
// The empty type is reserved for plural ranges.
|
|
continue
|
|
}
|
|
tests := []pluralTest{}
|
|
|
|
for _, pRules := range plurals.PluralRules {
|
|
for _, rule := range pRules.PluralRule {
|
|
test := pluralTest{
|
|
locales: pRules.Locales,
|
|
form: int(countMap[rule.Count]),
|
|
}
|
|
scan := bufio.NewScanner(strings.NewReader(rule.Data()))
|
|
scan.Split(splitTokens)
|
|
var p *[]string
|
|
for scan.Scan() {
|
|
switch t := scan.Text(); t {
|
|
case "@integer":
|
|
p = &test.integer
|
|
case "@decimal":
|
|
p = &test.decimal
|
|
case ",", "…":
|
|
default:
|
|
if p != nil {
|
|
*p = append(*p, t)
|
|
}
|
|
}
|
|
}
|
|
tests = append(tests, test)
|
|
}
|
|
}
|
|
w.WriteVar(plurals.Type+"Tests", tests)
|
|
}
|
|
}
|
|
|
|
func genPlurals(w *gen.CodeWriter, data *cldr.CLDR) {
|
|
for _, plurals := range data.Supplemental().Plurals {
|
|
if plurals.Type == "" {
|
|
continue
|
|
}
|
|
// Initialize setMap and inclusionMasks. They are already populated with
|
|
// a few entries to serve as an example and to assign nice numbers to
|
|
// common cases.
|
|
|
|
// setMap contains sets of numbers represented by boolean arrays where
|
|
// a true value for element i means that the number i is included.
|
|
setMap := map[[numN]bool]int{
|
|
// The above init func adds an entry for including all numbers.
|
|
[numN]bool{1: true}: 1, // fix {1} to a nice value
|
|
[numN]bool{2: true}: 2, // fix {2} to a nice value
|
|
[numN]bool{0: true}: 3, // fix {0} to a nice value
|
|
}
|
|
|
|
// inclusionMasks contains bit masks for every number under numN to
|
|
// indicate in which set the number is included. Bit 1 << x will be set
|
|
// if it is included in set x.
|
|
inclusionMasks := [numN]uint64{
|
|
// Note: these entries are not complete: more bits will be set along the way.
|
|
0: 1 << 3,
|
|
1: 1 << 1,
|
|
2: 1 << 2,
|
|
}
|
|
|
|
// Create set {0..99}. We will assign this set the identifier 0.
|
|
var all [numN]bool
|
|
for i := range all {
|
|
// Mark number i as being included in the set (which has identifier 0).
|
|
inclusionMasks[i] |= 1 << 0
|
|
// Mark number i as included in the set.
|
|
all[i] = true
|
|
}
|
|
// Register the identifier for the set.
|
|
setMap[all] = 0
|
|
|
|
rules := []pluralCheck{}
|
|
index := []byte{0}
|
|
langMap := map[int]byte{0: 0} // From compact language index to index
|
|
|
|
for _, pRules := range plurals.PluralRules {
|
|
// Parse the rules.
|
|
var conds []orCondition
|
|
for _, rule := range pRules.PluralRule {
|
|
form := countMap[rule.Count]
|
|
conds = parsePluralCondition(conds, rule.Data(), form)
|
|
}
|
|
// Encode the rules.
|
|
for _, c := range conds {
|
|
// If an or condition only has filters, we create an entry for
|
|
// this filter and the set that contains all values.
|
|
empty := true
|
|
for _, b := range c.used {
|
|
empty = empty && !b
|
|
}
|
|
if empty {
|
|
rules = append(rules, pluralCheck{
|
|
cat: byte(opMod<<opShift) | byte(c.form),
|
|
setID: 0, // all values
|
|
})
|
|
continue
|
|
}
|
|
// We have some entries with values.
|
|
for i, set := range c.set {
|
|
if !c.used[i] {
|
|
continue
|
|
}
|
|
index, ok := setMap[set]
|
|
if !ok {
|
|
index = len(setMap)
|
|
setMap[set] = index
|
|
for i := range inclusionMasks {
|
|
if set[i] {
|
|
inclusionMasks[i] |= 1 << uint64(index)
|
|
}
|
|
}
|
|
}
|
|
rules = append(rules, pluralCheck{
|
|
cat: byte(i<<opShift | andNext),
|
|
setID: byte(index),
|
|
})
|
|
}
|
|
// Now set the last entry to the plural form the rule matches.
|
|
rules[len(rules)-1].cat &^= formMask
|
|
rules[len(rules)-1].cat |= byte(c.form)
|
|
}
|
|
// Point the relevant locales to the created entries.
|
|
for _, loc := range strings.Split(pRules.Locales, " ") {
|
|
if strings.TrimSpace(loc) == "" {
|
|
continue
|
|
}
|
|
lang, ok := language.CompactIndex(language.MustParse(loc))
|
|
if !ok {
|
|
log.Printf("No compact index for locale %q", loc)
|
|
}
|
|
langMap[lang] = byte(len(index) - 1)
|
|
}
|
|
index = append(index, byte(len(rules)))
|
|
}
|
|
w.WriteVar(plurals.Type+"Rules", rules)
|
|
w.WriteVar(plurals.Type+"Index", index)
|
|
// Expand the values.
|
|
langToIndex := make([]byte, language.NumCompactTags)
|
|
for i := range langToIndex {
|
|
for p := i; ; p = int(internal.Parent[p]) {
|
|
if x, ok := langMap[p]; ok {
|
|
langToIndex[i] = x
|
|
break
|
|
}
|
|
}
|
|
}
|
|
w.WriteVar(plurals.Type+"LangToIndex", langToIndex)
|
|
// Need to convert array to slice because of golang.org/issue/7651.
|
|
// This will allow tables to be dropped when unused. This is especially
|
|
// relevant for the ordinal data, which I suspect won't be used as much.
|
|
w.WriteVar(plurals.Type+"InclusionMasks", inclusionMasks[:])
|
|
|
|
if len(rules) > 0xFF {
|
|
log.Fatalf("Too many entries for rules: %#x", len(rules))
|
|
}
|
|
if len(index) > 0xFF {
|
|
log.Fatalf("Too many entries for index: %#x", len(index))
|
|
}
|
|
if len(setMap) > 64 { // maximum number of bits.
|
|
log.Fatalf("Too many entries for setMap: %d", len(setMap))
|
|
}
|
|
w.WriteComment(
|
|
"Slots used for %s: %X of 0xFF rules; %X of 0xFF indexes; %d of 64 sets",
|
|
plurals.Type, len(rules), len(index), len(setMap))
|
|
// Prevent comment from attaching to the next entry.
|
|
fmt.Fprint(w, "\n\n")
|
|
}
|
|
}
|
|
|
|
type orCondition struct {
|
|
original string // for debugging
|
|
|
|
form Form
|
|
used [32]bool
|
|
set [32][numN]bool
|
|
}
|
|
|
|
func (o *orCondition) add(op opID, mod int, v []int) (ok bool) {
|
|
ok = true
|
|
for _, x := range v {
|
|
if x >= maxMod {
|
|
ok = false
|
|
break
|
|
}
|
|
}
|
|
for i := 0; i < numN; i++ {
|
|
m := i
|
|
if mod != 0 {
|
|
m = i % mod
|
|
}
|
|
if !intIn(m, v) {
|
|
o.set[op][i] = false
|
|
}
|
|
}
|
|
if ok {
|
|
o.used[op] = true
|
|
}
|
|
return ok
|
|
}
|
|
|
|
func intIn(x int, a []int) bool {
|
|
for _, y := range a {
|
|
if x == y {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
var operandIndex = map[string]opID{
|
|
"i": opI,
|
|
"n": opN,
|
|
"f": opF,
|
|
"v": opV,
|
|
"w": opW,
|
|
}
|
|
|
|
// parsePluralCondition parses the condition of a single pluralRule and appends
|
|
// the resulting or conditions to conds.
|
|
//
|
|
// Example rules:
|
|
// // Category "one" in English: only allow 1 with no visible fraction
|
|
// i = 1 and v = 0 @integer 1
|
|
//
|
|
// // Category "few" in Czech: all numbers with visible fractions
|
|
// v != 0 @decimal ...
|
|
//
|
|
// // Category "zero" in Latvian: all multiples of 10 or the numbers 11-19 or
|
|
// // numbers with a fraction 11..19 and no trailing zeros.
|
|
// n % 10 = 0 or n % 100 = 11..19 or v = 2 and f % 100 = 11..19 @integer ...
|
|
//
|
|
// @integer and @decimal are followed by examples and are not relevant for the
|
|
// rule itself. The are used here to signal the termination of the rule.
|
|
func parsePluralCondition(conds []orCondition, s string, f Form) []orCondition {
|
|
scan := bufio.NewScanner(strings.NewReader(s))
|
|
scan.Split(splitTokens)
|
|
for {
|
|
cond := orCondition{original: s, form: f}
|
|
// Set all numbers to be allowed for all number classes and restrict
|
|
// from here on.
|
|
for i := range cond.set {
|
|
for j := range cond.set[i] {
|
|
cond.set[i][j] = true
|
|
}
|
|
}
|
|
andLoop:
|
|
for {
|
|
var token string
|
|
scan.Scan() // Must exist.
|
|
switch class := scan.Text(); class {
|
|
case "t":
|
|
class = "w" // equal to w for t == 0
|
|
fallthrough
|
|
case "n", "i", "f", "v", "w":
|
|
op := scanToken(scan)
|
|
opCode := operandIndex[class]
|
|
mod := 0
|
|
if op == "%" {
|
|
opCode |= opMod
|
|
|
|
switch v := scanUint(scan); v {
|
|
case 10, 100:
|
|
mod = v
|
|
case 1000:
|
|
// A more general solution would be to allow checking
|
|
// against multiples of 100 and include entries for the
|
|
// numbers 100..900 in the inclusion masks. At the
|
|
// moment this would only help Azerbaijan and Italian.
|
|
|
|
// Italian doesn't use '%', so this must be Azerbaijan.
|
|
cond.used[opAzerbaijan00s] = true
|
|
return append(conds, cond)
|
|
|
|
case 1000000:
|
|
cond.used[opBretonM] = true
|
|
return append(conds, cond)
|
|
|
|
default:
|
|
log.Fatalf("Modulo value not supported %d", v)
|
|
}
|
|
op = scanToken(scan)
|
|
}
|
|
if op != "=" && op != "!=" {
|
|
log.Fatalf("Unexpected op %q", op)
|
|
}
|
|
if op == "!=" {
|
|
opCode |= opNotEqual
|
|
}
|
|
a := []int{}
|
|
v := scanUint(scan)
|
|
if class == "w" && v != 0 {
|
|
log.Fatalf("Must compare against zero for operand type %q", class)
|
|
}
|
|
token = scanToken(scan)
|
|
for {
|
|
switch token {
|
|
case "..":
|
|
end := scanUint(scan)
|
|
for ; v <= end; v++ {
|
|
a = append(a, v)
|
|
}
|
|
token = scanToken(scan)
|
|
default: // ",", "or", "and", "@..."
|
|
a = append(a, v)
|
|
}
|
|
if token != "," {
|
|
break
|
|
}
|
|
v = scanUint(scan)
|
|
token = scanToken(scan)
|
|
}
|
|
if !cond.add(opCode, mod, a) {
|
|
// Detected large numbers. As we ruled out Azerbaijan, this
|
|
// must be the many rule for Italian ordinals.
|
|
cond.set[opItalian800] = cond.set[opN]
|
|
cond.used[opItalian800] = true
|
|
}
|
|
|
|
case "@integer", "@decimal": // "other" entry: tests only.
|
|
return conds
|
|
default:
|
|
log.Fatalf("Unexpected operand class %q (%s)", class, s)
|
|
}
|
|
switch token {
|
|
case "or":
|
|
conds = append(conds, cond)
|
|
break andLoop
|
|
case "@integer", "@decimal": // examples
|
|
// There is always an example in practice, so we always terminate here.
|
|
if err := scan.Err(); err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
return append(conds, cond)
|
|
case "and":
|
|
// keep accumulating
|
|
default:
|
|
log.Fatalf("Unexpected token %q", token)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func scanToken(scan *bufio.Scanner) string {
|
|
scan.Scan()
|
|
return scan.Text()
|
|
}
|
|
|
|
func scanUint(scan *bufio.Scanner) int {
|
|
scan.Scan()
|
|
val, err := strconv.ParseUint(scan.Text(), 10, 32)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
return int(val)
|
|
}
|
|
|
|
// splitTokens can be used with bufio.Scanner to tokenize CLDR plural rules.
|
|
func splitTokens(data []byte, atEOF bool) (advance int, token []byte, err error) {
|
|
condTokens := [][]byte{
|
|
[]byte(".."),
|
|
[]byte(","),
|
|
[]byte("!="),
|
|
[]byte("="),
|
|
}
|
|
advance, token, err = bufio.ScanWords(data, atEOF)
|
|
for _, t := range condTokens {
|
|
if len(t) >= len(token) {
|
|
continue
|
|
}
|
|
switch p := bytes.Index(token, t); {
|
|
case p == -1:
|
|
case p == 0:
|
|
advance = len(t)
|
|
token = token[:len(t)]
|
|
return advance - len(token) + len(t), token[:len(t)], err
|
|
case p < advance:
|
|
// Don't split when "=" overlaps "!=".
|
|
if t[0] == '=' && token[p-1] == '!' {
|
|
continue
|
|
}
|
|
advance = p
|
|
token = token[:p]
|
|
}
|
|
}
|
|
return advance, token, err
|
|
}
|