// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package colltab

import "unicode/utf8"

// ContractTrieSet holds the entries of one or more contraction tries in a
// single flat slice. As consumed by the scan methods in this file:
//   - L is the byte an entry matches; when N is final, L and H together form
//     an inclusive byte range.
//   - H is, for a non-final entry, the offset of the next block of entries,
//     counted from the end of the current block.
//   - N is the number of entries in the next block, or final if there is none.
//   - I is the index recorded when the entry matches, or noIndex if the entry
//     records nothing. For a range entry the recorded index is I plus the
//     distance of the matched byte from L.
//
// For a description of ContractTrieSet, see text/collate/build/contract.go.
type ContractTrieSet []struct{ L, H, N, I uint8 }

// ctScanner is used to match a trie to an input sequence.
// A contraction may match a non-contiguous sequence of bytes in an input string.
// For example, if there is a contraction for <a, combining_ring>, it should match
// the sequence <a, combining_cedilla, combining_ring>, as combining_cedilla does
// not block combining_ring.
// ctScanner does not automatically skip over non-blocking non-starters, but rather
// retains the state of the last match and leaves it up to the user to continue
// the match at the appropriate points.
type ctScanner struct {
	states ContractTrieSet // entries starting at the block from which the next scan resumes
	s      []byte          // input being matched
	n      int             // number of entries in the block at the start of states
	index  int             // index recorded by the most recent match; 0 if nothing matched
	pindex int             // input offset just past the most recent recorded match
	done   bool            // set by scan once a final entry has been matched
}

// ctScannerString is the counterpart of ctScanner for string input; it differs
// only in the type of s.
type ctScannerString struct {
	states ContractTrieSet // entries starting at the block from which the next scan resumes
	s      string          // input being matched
	n      int             // number of entries in the block at the start of states
	index  int             // index recorded by the most recent match; 0 if nothing matched
	pindex int             // input offset just past the most recent recorded match
	done   bool            // set by scan once a final entry has been matched
}

// scanner returns a ctScanner over b whose states begin at t[index:] and whose
// initial block holds n entries. The remaining fields start at their zero
// values.
func (t ContractTrieSet) scanner(index, n int, b []byte) ctScanner {
	return ctScanner{s: b, states: t[index:], n: n}
}

// scannerString is like scanner, but returns a ctScannerString over str.
func (t ContractTrieSet) scannerString(index, n int, str string) ctScannerString {
	return ctScannerString{s: str, states: t[index:], n: n}
}

// result returns the offset i and bytes consumed p so far. If no suffix
// matched, i and p will be 0.
func (s *ctScanner) result() (i, p int) {
	return s.index, s.pindex
}

// result returns the recorded index i and input offset p of the last match,
// exactly as ctScanner.result does. Both are 0 if nothing has matched.
func (s *ctScannerString) result() (i, p int) {
	return s.index, s.pindex
}

const (
	final   = 0    // value of N marking an entry that has no following block
	noIndex = 0xFF // value of I marking an entry that records no index
)

// scan matches the longest suffix at the current location in the input
// and returns the number of bytes consumed.
func (s *ctScanner) scan(p int) int { pr := p // the p at the rune start str := s.s states, n := s.states, s.n for i := 0; i < n && p < len(str); { e := states[i] c := str[p] // TODO: a significant number of contractions are of a form that // cannot match discontiguous UTF-8 in a normalized string. We could let // a negative value of e.n mean that we can set s.done = true and avoid // the need for additional matches. if c >= e.L { if e.L == c { p++ if e.I != noIndex { s.index = int(e.I) s.pindex = p } if e.N != final { i, states, n = 0, states[int(e.H)+n:], int(e.N) if p >= len(str) || utf8.RuneStart(str[p]) { s.states, s.n, pr = states, n, p } } else { s.done = true return p } continue } else if e.N == final && c <= e.H { p++ s.done = true s.index = int(c-e.L) + int(e.I) s.pindex = p return p } } i++ } return pr } // scan is a verbatim copy of ctScanner.scan. func (s *ctScannerString) scan(p int) int { pr := p // the p at the rune start str := s.s states, n := s.states, s.n for i := 0; i < n && p < len(str); { e := states[i] c := str[p] // TODO: a significant number of contractions are of a form that // cannot match discontiguous UTF-8 in a normalized string. We could let // a negative value of e.n mean that we can set s.done = true and avoid // the need for additional matches. if c >= e.L { if e.L == c { p++ if e.I != noIndex { s.index = int(e.I) s.pindex = p } if e.N != final { i, states, n = 0, states[int(e.H)+n:], int(e.N) if p >= len(str) || utf8.RuneStart(str[p]) { s.states, s.n, pr = states, n, p } } else { s.done = true return p } continue } else if e.N == final && c <= e.H { p++ s.done = true s.index = int(c-e.L) + int(e.I) s.pindex = p return p } } i++ } return pr }