天泣記

さて、subexp call や Balancing Group Definition で括弧の対にマッチできるからといって、その木構造を取り出せないのではおもしろくない。

取り出せるようにしてみよう。

subexp call は、いってしまえば recursive descent parser なので、呼び出し関係にそった木を作ればいい。

[:tree, tag, r1, r2, ...] と [:leaf, r1, r2, ...] でやってみよう。前者で [tag, ...] というノードを作り、後者でマッチした場所をキャプチャした文字列を葉とする。

実装としてはスタックが必要だが、(.NET 式の配列な) キャプチャを使えばいいだろう。ひとつあればいいので、:T という名前のキャプチャを使うことにしよう。(いちおう、:tree と :leaf 以外で変にいじられないことは仮定しておこう)

require 'pp'

# Runs the pattern tree +re+ against +str+, anchored at character index +b+.
#
# Returns [matched_substring, captures] for the first match that the
# backtracking search produces, or nil when the search yields nothing.
def match(re, str, b=0)
  chars = str.split(//)
  # The frozen empty array is the default for every capture name, so an
  # unseen name reads as "no captures yet".
  initial_caps = Hash.new([].freeze)
  subexp_table = collect_subexps(re)
  try(re, chars, b, initial_caps, subexp_table) do |stop, caps|
    # Leaving the method from inside the block stops the search at the
    # first success.
    return [get_substr(chars, b...stop), caps]
  end
  nil
end

# Walks the pattern tree +re+ and records every [:subexp, name, ...] node
# in +subexps+ under its name, so that [:call, name] can look it up later.
#
# Returns +subexps+ (the same hash that was passed in, or a fresh one).
# Raises RuntimeError for a node whose tag is not recognised.
def collect_subexps(re, subexps={})
  tag = re[0]
  # Child patterns start after the tag and any per-node parameters
  # (name/tag for :cap, :subexp and :tree; min and max for :repn).
  children =
    case tag
    when :fail, :lit, :call then []
    when :cat, :alt, :rep, :leaf then re.drop(1)
    when :cap, :subexp, :tree then re.drop(2)
    when :repn then re.drop(3)
    else raise "unexpected: #{re.inspect}"
    end
  children.each { |child| collect_subexps(child, subexps) }
  # Registered after the children, so nested definitions are stored first.
  subexps[re[1]] = re if tag == :subexp
  subexps
end

# Dispatches on the tag of pattern node +re+ and tries to match it against
# the character array +ary+ starting at index +pos+.
#
# +cap+ is the capture hash accumulated so far and +subexps+ the table of
# named subexpressions. The block is forwarded to the tag-specific matcher;
# the matchers defined in this file call it with (end_position, captures)
# once per way the node can match. [:fail] never calls the block.
# Raises RuntimeError for an unknown tag.
def try(re, ary, pos, cap, subexps, &b)
  case re[0]
  when :fail
    nil
  when :lit
    try_lit(re[1], ary, pos, cap, subexps, &b)
  when :cat
    try_cat(re.drop(1), ary, pos, cap, subexps, &b)
  when :alt
    try_alt(re.drop(1), ary, pos, cap, subexps, &b)
  when :rep
    # Unbounded repetition is {0,} in terms of the general form.
    try_repn(0, nil, re.drop(1), ary, pos, cap, subexps, &b)
  when :repn
    try_repn(re[1], re[2], re.drop(3), ary, pos, cap, subexps, &b)
  when :cap
    try_cap(re[1], re.drop(2), ary, pos, cap, subexps, &b)
  when :subexp
    try_subexp(re[1], re.drop(2), ary, pos, cap, subexps, &b)
  when :call
    try_call(re[1], ary, pos, cap, subexps, &b)
  when :leaf
    try_leaf(re.drop(1), ary, pos, cap, subexps, &b)
  when :tree
    try_tree(re[1], re.drop(2), ary, pos, cap, subexps, &b)
  else
    raise "unexpected: #{re.inspect}"
  end
end

# Matches the single character +ch+ at index +pos+ of +ary+.
# On success yields (pos + 1, cap); otherwise does nothing.
def try_lit(ch, ary, pos, cap, subexps)
  return unless pos < ary.length && ary[pos] == ch

  yield pos + 1, cap
end

# r1 r2 ...
#
# Matches the patterns in +rs+ one after another starting at +pos+.
# An empty sequence matches without consuming anything. Each complete
# match of the whole sequence is passed to the block as (end_position,
# captures).
def try_cat(rs, ary, pos, cap, subexps, &block)
  return yield(pos, cap) if rs.empty?

  head = rs.first
  tail = rs.drop(1)
  # Every way the head matches is continued with the remaining patterns.
  try(head, ary, pos, cap, subexps) do |next_pos, next_cap|
    try_cat(tail, ary, next_pos, next_cap, subexps, &block)
  end
end

# r1 | r2 | ...
#
# Tries every alternative in +rs+ in order at the same position +pos+ with
# the same captures; each alternative's matches go to the block in turn.
def try_alt(rs, ary, pos, cap, subexps, &block)
  rs.each do |branch|
    try(branch, ary, pos, cap, subexps, &block)
  end
end

# (r1 r2 ...)*          # min=0, max=nil
# (r1 r2 ...)+          # min=1, max=nil
# (r1 r2 ...){min,max}
#
# Greedy bounded repetition of the sequence +rs+ starting at +pos+.
# +min+ is the number of iterations still required and +max+ the number
# still allowed (nil means unbounded). Longer matches are offered to the
# block before shorter ones; each is yielded as (end_position, captures).
def try_repn(min, max, rs, ary, pos, cap, subexps, &block)
  if max.nil? || 0 < max
    try_cat(rs, ary, pos, cap, subexps) {|pos2, cap2|
      if pos < pos2
        min2 = min == 0 ? 0 : (min-1)
        max2 = max ? (max-1) : nil
        try_repn(min2, max2, rs, ary, pos2, cap2, subexps, &block)
      elsif 0 < min
        # The body matched the empty string. Recursing would loop forever,
        # but the remaining required iterations can all be satisfied by the
        # same empty match, so accept here instead of failing outright
        # (otherwise something like (a*)+ could never match "").
        yield pos2, cap2
      end
      # With min == 0 an empty iteration adds nothing: the zero-iteration
      # result yielded below already covers this position.
    }
  end
  if min == 0
    yield pos, cap
  end
end

# (?<n>r1 r2 ...)
#
# Matches the sequence +rs+ and, for each match, appends the matched text
# to the list of captures stored under +n+. The incoming capture hash is
# never modified; the block receives (end_position, updated_copy).
def try_cap(n, rs, ary, pos, cap, subexps, &block)
  try_cat(rs, ary, pos, cap, subexps) do |stop, inner|
    updated = inner.dup
    # Non-destructive append: builds a new array rather than mutating the
    # one shared with other backtracking branches.
    updated[n] += [get_substr(ary, pos...stop)]
    yield stop, updated
  end
end

# (?<n>r1 r2 ...) without capture
#
# Matches the body +rs+ of a named subexpression definition in place.
# The name +n+ is not used here and nothing is added to the captures;
# the block is forwarded unchanged to try_cat.
def try_subexp(n, rs, ary, pos, cap, subexps, &block)
  try_cat(rs, ary, pos, cap, subexps, &block)
end

# \g<n>
#
# Subexpression call: looks up the pattern registered under +n+ in
# +subexps+ and matches it at +pos+. Raises KeyError when no
# subexpression of that name is registered.
def try_call(n, ary, pos, cap, subexps, &block)
  try(subexps.fetch(n), ary, pos, cap, subexps, &block)
end

# [:leaf, r1, r2, ...]
#
# Matches the sequence +rs+ and records the matched text as a leaf of the
# parse tree, by capturing it under the reserved name :T (the capture list
# that try_tree treats as its stack).
def try_leaf(rs, ary, pos, cap, subexps, &block)
  try_cap(:T, rs, ary, pos, cap, subexps, &block)
end

# [:tree, tag, r1, r2, ...]
#
# Matches the sequence +rs+ and folds everything that was added to the :T
# capture list during that match into a single node [tag, child, ...].
# The block receives (end_position, captures) where :T holds the entries
# that existed before this call followed by the new node.
def try_tree(tag, rs, ary, pos, cap, subexps, &block)
  # Height of the :T stack on entry; entries above it belong to this node.
  base = cap[:T].length
  try_cat(rs, ary, pos, cap, subexps) do |stop, inner|
    stack = inner[:T]
    node = [tag, *stack[base..-1]]
    result = inner.dup
    result[:T] = stack[0, base] + [node]
    yield stop, result
  end
end

# Joins the characters of +ary+ selected by +range+ into a new string and
# remembers where it came from by storing +range+ in the string's @pos
# instance variable.
def get_substr(ary, range)
  ary[range].join('').tap do |text|
    text.instance_variable_set(:@pos, range)
  end
end

# Balanced parentheses: S ::= ( "(" S ")" )*
# Each matched pair is wrapped in a :tree node tagged :S, so the nesting of
# the parentheses shows up as nesting of [:S, ...] arrays in the :T capture.
re = [:subexp, :S, [:rep, [:tree, :S, [:lit, "("], [:call, :S], [:lit, ")"]]]]
p match(re, "((())()))))")
#=> ["((())())", {:T=>[[:S, [:S, [:S]], [:S]]]}]

とりあえず対になった括弧をやってみると、((())()) が [:S, [:S, [:S]], [:S]] となり、ちゃんと parse できているようだ。

もうちょっと複雑なものとして、(基本的な) 正規表現を parse してみよう。

# A grammar for (basic) regular expressions over the letters a and b:
#
#   R ::= T ("|" T)+  => [:ALT, ...]   | T
#   T ::= C C+        => [:CAT, ...]   | C
#   C ::= V "*"       => [:REP, ...]   | V
#   V ::= "a" | "b"   => [:LIT, text]  | "(" R ")"
#
# The first alternative begins with [:fail], so it can never match; it is
# there only so that collect_subexps registers the :subexp definitions
# without them being matched in place. The second alternative starts the
# actual parse by calling :R.
#
# The subexpression named :T lives in the subexps table, while the tree
# stack :T lives in the capture hash, so the two names do not collide.
re = [:alt, [:cat, [:fail],
                   [:subexp, :R, [:alt, [:tree, :ALT, [:call, :T], [:repn, 1, nil, [:lit, "|"], [:call, :T]]],
                                        [:call, :T]]],
                   [:subexp, :T, [:alt, [:tree, :CAT, [:repn, 2, nil, [:call, :C]]],
                                        [:call, :C]]],
                   [:subexp, :C, [:alt, [:tree, :REP, [:cat, [:call, :V], [:lit, "*"]]],
                                        [:call, :V]]],
                   [:subexp, :V, [:alt, [:tree, :LIT, [:leaf, [:alt, [:lit, "a"],
                                                                     [:lit, "b"]]]],
                                        [:cat, [:lit, "("], [:call, :R], [:lit, ")"]]]]],
            [:call, :R]]
pp match(re, "a(bb(ab)*aa|a)*")
#=>
#["a(bb(ab)*aa|a)*",
# {:T=>
#   [[:CAT,
#     [:LIT, "a"],
#     [:REP,
#      [:ALT,
#       [:CAT,
#        [:LIT, "b"],
#        [:LIT, "b"],
#        [:REP, [:CAT, [:LIT, "a"], [:LIT, "b"]]],
#        [:LIT, "a"],
#        [:LIT, "a"]],
#       [:LIT, "a"]]]]]}]

ちゃんと parse できている。(混乱を防ぐため、parse 結果の木では tag を大文字にしてあるが、小文字にしてもよい)

recursive descent parser なので、あたりまえにできることができているだけのことではあるが。

では、Balancing Group Definition で木を取り出せるか。

なかなか難しい?

とりあえずスタックを複数使えるため、自明な木構造が定義できないのは厄介だ。

ひとつ指定して使うと限定すれば?

天泣記

2011-02-02 (Wed)

2011-02-03 (Thu)

2011-02-06 (Sun)

2011-02-20 (Sun)