From Gossip@caterpillar

Algorithm Gossip: 字串核對

 說明

今日的一些高階程式語言對於字串的處理支援越來越強大(例如Java、Perl等),不過字串搜尋本身仍是個值得 探討的課題,在這邊以Boyer- Moore法來說明如何進行字串說明,這個方法快且原理簡潔易懂。

解法

字串搜尋本身不難,使用暴力法也可以求解,但如何快速搜尋字串就不簡單了,傳統的字串搜尋是從關鍵字與字串的開頭 開始比對,例如 Knuth-Morris-Pratt 演算法 字串搜尋,這個方法也不錯,不過要花時間在公式計算上;Boyer-Moore字串核對改由關鍵字的後面開始核對字串,並製作前進表,如果比對不符合則依 前進表中的值前進至下一個核對處,假設是p好了,然後比對字串中p-n+1至p的值是否與關鍵字相同。
字串核對

那麼前進表該如何前進,舉個實際的例子,如果要在字串中搜尋JUST這個字串,則可能遇到的幾個情況如下所示:
字串核對


依照這個例子,可以決定出我們的前進值表如下:
其它 J U S T
4 3 2 1 4(match?)

如果關鍵字中有重複出現的字元,則前進值就會有兩個以上的值,此時則取前進值較小的值,如此就不會跳過可能的位置,例如texture這個關鍵字,t的前 進值應該取後面的3而不是取前面的7。

實作:C    Java    Python    Scala    Ruby

  • C
#include <stdio.h> 
#include <stdlib.h>
#include <string.h>
#define SKIP_TABLE_SIZE 256
#define STRING_LENGTH 80

void table(int*, char*); // 建立前進表
int search(int* skip, int, char*, char*); // 搜尋關鍵字
void substring(char*, char*, int, int); // 取出子字串

int main(void) {
int skip[SKIP_TABLE_SIZE];

char str_input[STRING_LENGTH];
char str_key[STRING_LENGTH];
char tmp[STRING_LENGTH] = {'\0'};

printf("請輸入字串:");
gets(str_input);
printf("請輸入搜尋關鍵字:");
gets(str_key);

int m = strlen(str_input); // 計算字串長度
int n = strlen(str_key);
table(skip, str_key);
int p = search(skip, n-1, str_input, str_key);

while(p != -1) {
substring(str_input, tmp, p, m);
printf("%s\n", tmp);
p = search(skip, p+n+1, str_input, str_key);
}

printf("\n");

return 0;
}

void table(int* skip, char *key) {
int n = strlen(key);
int k;
for(k = 0; k < SKIP_TABLE_SIZE; k++)
skip[k] = n;
for(k = 0; k < n - 1; k++)
skip[key[k]] = n - k - 1;
}

int search(int* skip, int p, char* input, char* key) {
char tmp[STRING_LENGTH] = {'\0'};

int m = strlen(input);
int n = strlen(key);

while(p < m) {
substring(input, tmp, p-n+1, p);

if(!strcmp(tmp, key)) // 比較兩字串是否相同
return p-n+1;
p += skip[input[p]];
}

return -1;
}

void substring(char *text, char* tmp, int s, int e) {
int i, j;
for(i = s, j = 0; i <= e; i++, j++)
tmp[j] = text[i];
tmp[j] = '\0';
}

  • Java
import java.util.*;
import java.io.*;

public class StringMatcher implements Iterable<String>, Iterator<String>{
private int[] skip = new int[256];
private String str;
private String key;
private int pos;

public StringMatcher(String str, String key) {
this.str = str;
this.key = key;
for(int k = 0; k < 256; k++)
skip[k] = key.length();
for(int k = 0; k < key.length() - 1; k++)
skip[key.charAt(k)] = key.length() - k - 1;
pos = search(key.length()-1, str, key);
}

public Iterator<String> iterator() { return this; }

private int search(int p, String str, String key) {
int tp = p;
while(tp < str.length()
&& ! str.substring(tp-key.length()+1, tp+1).equals(key)) {
tp += skip[str.charAt(tp)];
}
return tp-key.length()+1;
}

public boolean hasNext() {
return pos < str.length() - 1;
}

public String next() {
String tmp = str.substring(pos);
pos = search(pos+key.length()+1, str, key);
return tmp;
}

public void remove() { throw new RuntimeException("Not supported"); }

public static void main(String[] args) throws IOException {
BufferedReader bufReader =
new BufferedReader(
new InputStreamReader(System.in));

System.out.print("請輸入字串:");
String str = bufReader.readLine();
System.out.print("請輸入搜尋關鍵字:");
String key = bufReader.readLine();

for(String s : new StringMatcher(str, key)) {
System.out.println(s);
}
}
}

  • Python
class StringMatcher:
def __init__(self, str, key):
self.__str = str
self.__key = key
self.__skip = [len(key) for k in range(256)]
for k in range(len(key) - 1):
self.__skip[ord(key[k])] = len(key) -k - 1
self.__pos = self.__search(len(key), str, key)

def __iter__(self):
return self

def __search(self, p, input, key):
tp = p
while(tp < len(input) and input[tp-len(key) + 1: tp + 1] != key):
tp += self.__skip[ord(input[tp])]
return tp - len(key) + 1

def __next__(self):
if not self.__hasNext():
raise StopIteration
tmp = self.__str[self.__pos:]
self.__pos = self.__search(
self.__pos + len(self.__key) + 1, str, self.__key)
return tmp

def __hasNext(self):
return self.__pos < len(self.__str) - 1

str = input("請輸入字串:")
key = input("請輸入搜尋關鍵字:")
for s in StringMatcher(str, key):
print(s)

  • Scala
class StringMatcher(val str: String, 
val key: String) extends Iterator[String] {
private val skip = {
val table = (for(k <- 0 to 256) yield key.length).toArray
for(k <- 0 until key.length - 1) {
table(key.charAt(k)) = key.length - k - 1
}
table
}

private var pos = search(key.length - 1, str, key)

private def search(p: Int, str: String, key: String) = {
var tp = p
while(tp < str.length &&
str.substring(tp - key.length + 1, tp + 1) != key) {
tp += skip(str.charAt(tp))
}
tp - key.length + 1
}

def hasNext() = pos < str.length - 1

def next() = {
val tmp = str.substring(pos)
pos = search(pos + key.length + 1, str, key)
tmp
}
}

print("請輸入字串:")
val str = readLine
print("請輸入搜尋關鍵字:")
val key = readLine
for(s <- new StringMatcher(str, key)) {
println(s)
}

  • Ruby
# encoding: Big5
class StringMatcher
def initialize(str, key)
@str = str
@key = key
@skip = Array.new(256, key.length)
(@key.length - 1).times { |k|
@skip[key[k].ord] = key.length - k - 1
}
@pos = search(key.length, str, key)
end

def search(p, input, key)
tp = p
while tp < input.length && input[(tp - key.length + 1)..(tp)] != key
tp += @skip[input[tp].ord]
end
tp - key.length + 1
end
private :search

def each
while @pos < @str.length - 1
tmp = @str[@pos..-1]
@pos = search(@pos + @key.length, @str, @key)
yield(tmp)
end
end
end

print "請輸入字串:"
str = gets.chomp
print "請輸入搜尋關鍵字:"
key = gets.chomp
StringMatcher.new(str, key).each { |s|
puts s
}