1. 問題描述
馬爾可夫鏈算法用于生成一段隨機的英文,其思想非常簡單。首先讀入數據,然后將讀入的數據分成前綴和后綴兩部分,通過前綴來隨機獲取后綴,籍此產生一段可讀的隨機英文。
為了說明方便,假設我們有如下一段話:
下面是處理過的數據:
設置 w1 和 w2 為文本的前兩個詞
輸出 w1 和 w2
循環:
隨機地選出 w3,它是文本中 w1 w2 的后綴中的一個
打印 w3
把 w1 和 w2 分別換成 w2 和 w3
重復循環
2.awk 程序
馬爾可夫鏈算法并不難,我們會在后面看到,用c語言來解決這個問題會相當麻煩,而用awk則只需要5分鐘就能搞定。這簡直就是一個演示awk優點的問題。
awk 中有關聯數組,正好可以用來表示前綴和后綴的關系。程序如下:
# markov.awk: markov chain algorithm for 2-word prefixesBEGIN { MAXGEN = 10000; NONWORD = "/n"; w1 = w2 = NONWORD }{ for (i = 1; i <= NF; i++) { # read all words statetab[w1,w2,++nsuffix[w1,w2]] = $i w1 = w2 w2 = $i }}END { statetab[w1,w2,++nsuffix[w1,w2]] = NONWORD # add tail w1 = w2 = NONWORD for (i = 0; i < MAXGEN; i++) { # generate r = int(rand()*nsuffix[w1,w2]) + 1 # nsuffix >= 1 p = statetab[w1,w2,r] if (p == NONWORD) exit print p w1 = w2 # advance chain w2 = p }}
3. C++ 程序
該問題的主要難點就在于通過前綴隨機的獲取后綴,在C++ 中,我們可以借助map 來實現前綴和后綴的對應關系,以此得到較高的開發效率。
/* Copyright (C) 1999 Lucent Technologies *//* Excerpted from 'The Practice of Programming' *//* by Brian W. Kernighan and Rob Pike */#include <time.h>#include <iostream>#include <string>#include <deque>#include <map>#include <vector>using namespace std;const int NPREF = 2;const char NONWORD[] = "/n"; // cannot appear as real line: we remove newlinesconst int MAXGEN = 10000; // maximum words generatedtypedef deque<string> Prefix;map<Prefix, vector<string> > statetab; // prefix -> suffixesvoid build(Prefix&, istream&);void generate(int nwords);void add(Prefix&, const string&);// markov main: markov-chain random text generationint main(void){ int nwords = MAXGEN; Prefix prefix; // current input prefix srand(time(NULL)); for (int i = 0; i < NPREF; i++) add(prefix, NONWORD); build(prefix, cin); add(prefix, NONWORD); generate(nwords); return 0;}// build: read input words, build state tablevoid build(Prefix& prefix, istream& in){ string buf; while (in >> buf) add(prefix, buf);}// add: add word to suffix deque, update prefixvoid add(Prefix& prefix, const string& s){ if (prefix.size() == NPREF) { statetab[prefix].push_back(s); prefix.pop_front(); } prefix.push_back(s);}// generate: produce output, one word per linevoid generate(int nwords){ Prefix prefix; int i; for (i = 0; i < NPREF; i++) add(prefix, NONWORD); for (i = 0; i < nwords; i++) { vector<string>& suf = statetab[prefix]; const string& w = suf[rand() % suf.size()]; if (w == NONWORD) break; cout << w << "/n"; prefix.pop_front(); // advance prefix.push_back(w); }}
4. c 程序
如果需要程序運行得足夠快,那就只能用較低級的語言來實現了。當我們用c 語言來實現時,就不得不考慮各種各樣的問題了。首先,面臨的第一個問題就是,如何表示前綴和后綴的關系?
這里采用前綴的key,后綴為value 的方式存儲前綴與后綴的關系,我們知道,hash表的查找速度最快,所以,這里采用hash表也是情理之中的事,只是看你能不能想到,用前綴作key,基于上面的思路,再仔細一點,就沒有什么大問題了。
/* Copyright (C) 1999 Lucent Technologies *//* Excerpted from 'The Practice of Programming' *//* by Brian W. Kernighan and Rob Pike *//* * Markov chain random text generator. */#include <string.h>#include <stdlib.h>#include <stdio.h>#include <string.h>#include <time.h>#include "eprintf.h"enum { NPREF = 2, /* number of prefix words */ NHASH = 4093, /* size of state hash table array */ MAXGEN = 10000 /* maximum words generated */};typedef struct State State;typedef struct Suffix Suffix;struct State { /* prefix + suffix list */ char *pref[NPREF]; /* prefix words */ Suffix *suf; /* list of suffixes */ State *next; /* next in hash table */};struct Suffix { /* list of suffixes */ char *word; /* suffix */ Suffix *next; /* next in list of suffixes */};State *lookup(char *prefix[], int create);void build(char *prefix[], FILE*);void generate(int nwords);void add(char *prefix[], char *word);State *statetab[NHASH]; /* hash table of states */char NONWORD[] = "/n"; /* cannot appear as real word *//* markov main: markov-chain random text generation */int main(void){ int i, nwords = MAXGEN; char *prefix[NPREF]; /* current input prefix */ int c; long seed; setprogname("markov"); seed = time(NULL); srand(seed); for (i = 0; i < NPREF; i++) /* set up initial prefix */ prefix[i] = NONWORD; build(prefix, stdin); add(prefix, NONWORD); generate(nwords); return 0;} const int MULTIPLIER = 31; /* for hash() *//* hash: compute hash value for array of NPREF strings */unsigned int hash(char *s[NPREF]){ unsigned int h; unsigned char *p; int i; h = 0; for (i = 0; i < NPREF; i++) for (p = (unsigned char *) s[i]; *p != '/0'; p++) h = MULTIPLIER * h + *p; return h % NHASH;}/* lookup: search for prefix; create if requested. *//* returns pointer if present or created; NULL if not. *//* creation doesn't strdup so strings mustn't change later. */State* lookup(char *prefix[NPREF], int create){ int i, h; State *sp; h = hash(prefix); for (sp = statetab[h]; sp != NULL; sp = sp->next) { for (i = 0; i < NPREF; i++) if (strcmp(prefix[i], sp->pref[i]) != 0) break; if (i == NPREF) /* found it */ return sp; } if (create) { sp = (State *) emalloc(sizeof(State)); for (i = 0; i < NPREF; i++) sp->pref[i] = prefix[i]; sp->suf = NULL; sp->next = statetab[h]; statetab[h] = sp; } return sp;}/* addsuffix: add to state. suffix must not change later */void addsuffix(State *sp, char *suffix){ Suffix *suf; suf = (Suffix *) emalloc(sizeof(Suffix)); suf->word = suffix; suf->next = sp->suf; sp->suf = suf;}/* add: add word to suffix list, update prefix */void add(char *prefix[NPREF], char *suffix){ State *sp; sp = lookup(prefix, 1); /* create if not found */ addsuffix(sp, suffix); /* move the words down the prefix */ memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0])); prefix[NPREF-1] = suffix;}/* build: read input, build prefix table */void build(char *prefix[NPREF], FILE *f){ char buf[100], fmt[10]; /* create a format string; %s could overflow buf */ sprintf(fmt, "%%%ds", sizeof(buf)-1); while (fscanf(f, fmt, buf) != EOF) add(prefix, estrdup(buf));}/* generate: produce output, one word per line */void generate(int nwords){ State *sp; Suffix *suf; char *prefix[NPREF], *w; int i, nmatch; for (i = 0; i < NPREF; i++) /* reset initial prefix */ prefix[i] = NONWORD; for (i = 0; i < nwords; i++) { sp = lookup(prefix, 0); if (sp == NULL) eprintf("internal error: lookup failed"); nmatch = 0; for (suf = sp->suf; suf != NULL; suf = suf->next) if (rand() % ++nmatch == 0) /* prob = 1/nmatch */ w = suf->word; if (nmatch == 0) eprintf("internal error: no suffix %d %s", i, prefix[0]); if (strcmp(w, NONWORD) == 0) break; printf("%s/n", w); memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0])); prefix[NPREF-1] = w; }}
新聞熱點
疑難解答
圖片精選