Index: /trunk/Scripts/Contributed/FontLabTokenize.py =================================================================== --- /trunk/Scripts/Contributed/FontLabTokenize.py (revision 24) +++ /trunk/Scripts/Contributed/FontLabTokenize.py (revision 24) @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +"""FontLab Tokenize + +Tokenize FontLab’s preview/metrics text into single characters +respecting escaped glyph names (eg. “/A.smcp”) and providing a +lossless reverse function. Sample usage (and actual test suite): + +tokenize('/A/B/C') +['/A', '/B', '/C'] +tokenize('abcde/B/C') +['a', 'b', 'c', 'd', 'e', '/B', '/C'] +tokenize('foo/A.smcp/B.smcp abc') +['f', 'o', 'o', '/A.smcp', '/B.smcp', 'a', 'b', 'c'] +p = ['f', 'o', 'o', '/A.smcp', '/B.smcp', 'a', 'b', 'c'] +serialize(p) +'foo/A.smcp/B.smcp abc' +tokenize('/a /b /c') +['/a', '/b', '/c'] +tokenize('/a/b c') +['/a', '/b', 'c'] +tokenize('@a@b@') +['@', 'a', '@', 'b', '@'] +tokenize('abc def ghi ') +['a', 'b', 'c', ' ', 'd', 'e', 'f', ' ', 'g', 'h', 'i', ' '] +p = ['a', 'b', 'c', ' ', 'd', 'e', 'f', ' ', 'g', 'h', 'i', ' '] +serialize(p) +'abc def ghi ' +serialize(['/a', 'b', '/c', 'd']) +'/a b/c d' +""" + +__author__ = 'Antonio Cavedoni ' +__version__ = '0.1' +__svnid__ = '$Id$' +__license__ = 'Python' + +def tokenize(input): + tokens = [] + escaped = [] + for i in range(len(input)): + x = input[i] + if x != '/' and not escaped: + tokens.append(x) + else: + if x == '/' and not escaped: + # append the slash so the escaped list is no longer + # false: starts capturing elements + escaped.append(x) + elif x != '/' and escaped: + if i == (len(input) - 1): + escaped.append(x) + tokens.append("".join(escaped)) + else: + if x == ' ': + tokens.append("".join(escaped)) + escaped = [] + else: + escaped.append(x) + elif x == '/' and escaped: + # starts a new sequence so, flush the escaped buffer + # and start anew + tokens.append("".join(escaped)) + escaped = [x] + + return tokens + +def serialize(tokens): + series = [] + for i in range(len(tokens)): + t = tokens[i] + if t.startswith('/') and i != (len(tokens) - 1): + if not tokens[i+1].startswith('/'): + series.append(t + ' ') + else: + series.append(t) + else: + series.append(t) + + return "".join(series) + +if __name__ == "__main__": + import doctest + doctest.testmod()