enbug diary(2007-02-13)

2007-02-13

_ Suffix Array

odzさんのSuffix Array を三分間ハッキングで高速化してみる。

import sys

def build_suffix_array(text, indices = None):
    if indices is None:
        indices = range(len(text))

    length = len(text)
    def suffix_compare(a, b):
        m = min(a, b)
        for i in xrange(0, length - m + 63, 64):
            c = cmp(text[a+i:a+i+64], text[b+i:b+i+64])
            if c: return c
        return 0

    indices.sort(suffix_compare)
    return indices

def read_file(filename):
    fp = file(filename)
    try:
        return fp.read()
    finally:
        fp.close()

def suffix(text, index, max_length):
    last = text.find('\n', index, index + max_length)
    if last != -1:
        return text[index:last]
    else:
        return text[index:index + max_length]

def main(args, options):
    if args:
        text = ''.join(read_file(f) for f in args)
    else:
        text = sys.stdin.read()

    if options.encoding:
        text = text.decode(options.encoding)

    suffix_array = build_suffix_array(text)
    for i in suffix_array:
        sys.stdout.write('%d\t%s\n' % (i, suffix(text, i, 10).encode('utf-8')))

if __name__ == '__main__':
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option('-e', '--encoding', dest='encoding',
                      help='input encoding', metavar='encoding')
    (options, args) = parser.parse_args()

    main(args, options)

元ネタの問題は文字列のコピーを作りすぎるところにあるので、小部分の比較に置き換えれば十分に速くなる。 Pythonでは文字列の範囲外は空文字列にしてくれるので、明示的なチェックはこの場合不要。

こういう姑息な最適化をするとPsycoがさっぱり効かなくなるのだが、 Psycoを使うより速くなるので、まあいいとします。

そもそもの問題はコピーを作らずに比較できないことなのだけれど、 Python 3000ではsubstringを共有させるようにしようとか、そういう話も持ち上がっていたみたい。結局どうなるかは覚えてない。

[ツッコミを入れる]

[]