亚洲香蕉成人av网站在线观看_欧美精品成人91久久久久久久_久久久久久久久久久亚洲_热久久视久久精品18亚洲精品_国产精自产拍久久久久久_亚洲色图国产精品_91精品国产网站_中文字幕欧美日韩精品_国产精品久久久久久亚洲调教_国产精品久久一区_性夜试看影院91社区_97在线观看视频国产_68精品久久久久久欧美_欧美精品在线观看_国产精品一区二区久久精品_欧美老女人bb

首頁 > 編程 > Java > 正文

java使用Nagao算法實現新詞發現、熱門詞的挖掘

2019-11-26 15:03:19
字體:
來源:轉載
供稿:網友

采用Nagao算法統計各個子字符串的頻次,然后基于這些頻次統計每個字符串的詞頻、左右鄰個數、左右熵、交互信息(內部凝聚度)。

名詞解釋:

  Nagao算法:一種快速的統計文本里所有子字符串頻次的算法。詳細算法可見http://www.doc88.com/p-664123446503.html
  詞頻:該字符串在文檔中出現的次數。出現次數越多越重要。
  左右鄰個數:文檔中該字符串的左邊和右邊出現的不同的字的個數。左右鄰越多,說明字符串成詞概率越高。
  左右熵:文檔中該字符串的左邊和右邊出現的不同的字的數量分布的熵。類似上面的指標,有一定區別。
  交互信息:每次將某字符串分成兩部分,左半部分字符串和右半部分字符串,計算其同時出現的概率除以其各自獨立出現的概率的乘積,最后取所有劃分中該比值的最小值。這個值越大,說明字符串內部凝聚度越高,越可能成詞。

算法具體流程:

1.  將輸入文件逐行讀入,按照非漢字([^\u4E00-\u9FA5]+)以及停詞“的很了么呢是嘛個都也比還這于不與才上用就好在和對挺去后沒說”,
分成一個個字符串,代碼如下:
String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
停用詞可以修改。
2.  獲取所有切分后的字符串的左子串和右子串,分別加入左、右PTable
3.  對PTable排序,并計算LTable。LTable記錄的是,排序后的PTable中,下一個子串同上一個子串具有相同字符的數量
4.  遍歷PTable和LTable,即可得到所有子字符串的詞頻、左右鄰
5.  根據所有子字符串的詞頻、左右鄰結果,輸出字符串的詞頻、左右鄰個數、左右熵、交互信息

1.  NagaoAlgorithm.java

package com.algo.word;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * New-word / hot-word discovery based on the Nagao substring-frequency algorithm.
 *
 * <p>Every input line is cut into phrases of consecutive Chinese characters. All suffixes of
 * each phrase go into the "right" pointer table and all suffixes of the reversed phrase go
 * into the "left" pointer table. After sorting both tables and computing the common-prefix
 * length of neighbouring entries (the L tables), a single pass yields, for every substring
 * of at most {@code N} characters, its term frequency and the distribution of the characters
 * that follow it (right table) or precede it (left table).
 *
 * <p>Use the static {@code applyNagao} methods; instances are created internally.
 */
public class NagaoAlgorithm {

  /** Characters that always act as phrase separators in addition to non-Chinese text. */
  private static final String STOPWORDS = "的很了么呢是嘛個都也比還這于不與才上用就好在和對挺去后沒說";

  /**
   * Splits a line on runs of characters outside the range U+4E00..U+9FA5 or on any single
   * stop character. The range bounds are written as regex-level escapes (double backslash)
   * so the pattern does not depend on how the source file itself is encoded.
   */
  private static final Pattern PHRASE_SPLITTER =
      Pattern.compile("[^\\u4E00-\\u9FA5]+|[" + STOPWORDS + "]");

  /** Maximum substring length used when no explicit value is given. */
  private static final int DEFAULT_N = 5;

  /** Output filter used when no explicit value is given: TF, left, right, MI thresholds. */
  private static final String DEFAULT_FILTER = "20,3,3,5";

  /** Maximum length of the substrings that are counted. */
  private int N;

  /** Suffixes of the reversed phrases; sorted in {@link #countLTable()}. */
  private List<String> leftPTable;
  /** Common-prefix length between entry i-1 and entry i of the sorted left table. */
  private int[] leftLTable;
  /** Suffixes of the phrases; sorted in {@link #countLTable()}. */
  private List<String> rightPTable;
  /** Common-prefix length between entry i-1 and entry i of the sorted right table. */
  private int[] rightLTable;
  /** Total number of characters over all phrases; denominator for probabilities. */
  private double wordNumber;

  /** Statistics per counted substring. */
  private Map<String, TFNeighbor> wordTFNeighbor;

  private NagaoAlgorithm() {
    N = DEFAULT_N;
    leftPTable = new ArrayList<String>();
    rightPTable = new ArrayList<String>();
    wordTFNeighbor = new HashMap<String, TFNeighbor>();
  }

  /** Returns {@code phrase} with its characters in reverse order. */
  private String reverse(String phrase) {
    return new StringBuilder(phrase).reverse().toString();
  }

  /** Returns the number of leading characters that {@code s1} and {@code s2} share. */
  private int coPrefixLength(String s1, String s2) {
    int limit = Math.min(s1.length(), s2.length());
    int shared = 0;
    while (shared < limit && s1.charAt(shared) == s2.charAt(shared)) {
      shared++;
    }
    return shared;
  }

  /**
   * Splits {@code line} into phrases and adds every suffix of each phrase to the right table
   * and every suffix of each reversed phrase to the left table.
   */
  private void addToPTable(String line) {
    String[] phrases = PHRASE_SPLITTER.split(line);
    for (String phrase : phrases) {
      for (int i = 0; i < phrase.length(); i++) {
        rightPTable.add(phrase.substring(i));
      }
      String reversePhrase = reverse(phrase);
      for (int i = 0; i < reversePhrase.length(); i++) {
        leftPTable.add(reversePhrase.substring(i));
      }
      wordNumber += phrase.length();
    }
  }

  /** Builds the L table for an already sorted P table; entry 0 stays 0. */
  private int[] buildLTable(List<String> sortedPTable) {
    int[] lTable = new int[sortedPTable.size()];
    for (int i = 1; i < sortedPTable.size(); i++) {
      lTable[i] = coPrefixLength(sortedPTable.get(i - 1), sortedPTable.get(i));
    }
    return lTable;
  }

  /** Sorts both P tables and computes their L tables. */
  private void countLTable() {
    Collections.sort(rightPTable);
    rightLTable = buildLTable(rightPTable);

    Collections.sort(leftPTable);
    leftLTable = buildLTable(leftPTable);

    System.out.println("Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable");
  }

  /**
   * Walks the sorted tables and fills {@link #wordTFNeighbor} with term frequency and right
   * neighbours (right table), then adds left neighbours (left table).
   */
  private void countTFNeighbor() {
    // Right table: TF and right neighbours.
    for (int pIndex = 0; pIndex < rightPTable.size(); pIndex++) {
      String phrase = rightPTable.get(pIndex);
      // Prefixes no longer than rightLTable[pIndex] are shared with the previous entry and
      // were therefore already counted when an earlier entry was processed.
      for (int length = 1 + rightLTable[pIndex]; length <= N && length <= phrase.length(); length++) {
        String word = phrase.substring(0, length);
        TFNeighbor tfNeighbor = new TFNeighbor();
        tfNeighbor.incrementTF();
        if (phrase.length() > length) {
          tfNeighbor.addToRightNeighbor(phrase.charAt(length));
        }
        // Following entries keep sharing this prefix as long as their L value is >= length.
        for (int lIndex = pIndex + 1; lIndex < rightLTable.length; lIndex++) {
          if (rightLTable[lIndex] < length) {
            break;
          }
          tfNeighbor.incrementTF();
          String coPhrase = rightPTable.get(lIndex);
          if (coPhrase.length() > length) {
            tfNeighbor.addToRightNeighbor(coPhrase.charAt(length));
          }
        }
        wordTFNeighbor.put(word, tfNeighbor);
      }
    }

    // Left table: entries are reversed, so the character after the prefix is the left
    // neighbour of the un-reversed word.
    for (int pIndex = 0; pIndex < leftPTable.size(); pIndex++) {
      String phrase = leftPTable.get(pIndex);
      for (int length = 1 + leftLTable[pIndex]; length <= N && length <= phrase.length(); length++) {
        String word = reverse(phrase.substring(0, length));
        TFNeighbor tfNeighbor = wordTFNeighbor.get(word);
        if (phrase.length() > length) {
          tfNeighbor.addToLeftNeighbor(phrase.charAt(length));
        }
        for (int lIndex = pIndex + 1; lIndex < leftLTable.length; lIndex++) {
          if (leftLTable[lIndex] < length) {
            break;
          }
          String coPhrase = leftPTable.get(lIndex);
          if (coPhrase.length() > length) {
            tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length));
          }
        }
      }
    }
    System.out.println("Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor");
  }

  /**
   * Returns the mutual-information score of {@code word}: over every split of the word into
   * a left and a right part, the minimum of P(word) / (P(left) * P(right)), where each
   * probability is the term frequency divided by {@link #wordNumber}. Words of length 0 or 1
   * score 0.
   */
  private double countMI(String word) {
    if (word.length() <= 1) {
      return 0;
    }
    double coProbability = wordTFNeighbor.get(word).getTF() / wordNumber;
    double minimum = Double.POSITIVE_INFINITY;
    for (int pos = 1; pos < word.length(); pos++) {
      double leftProbability = wordTFNeighbor.get(word.substring(0, pos)).getTF() / wordNumber;
      double rightProbability = wordTFNeighbor.get(word.substring(pos)).getTF() / wordNumber;
      double ratio = coProbability / (leftProbability * rightProbability);
      if (ratio < minimum) {
        minimum = ratio;
      }
    }
    return minimum;
  }

  /** Reads the stop-word file; only lines longer than one character are kept. */
  private Set<String> readStopWords(String stopList) throws IOException {
    Set<String> stopWords = new HashSet<String>();
    BufferedReader br = new BufferedReader(new FileReader(stopList));
    try {
      String line;
      while ((line = br.readLine()) != null) {
        if (line.length() > 1) {
          stopWords.add(line);
        }
      }
    } finally {
      br.close();
    }
    return stopWords;
  }

  /**
   * Writes one comma-separated record per retained word to {@code out}: word, TF, left
   * neighbour count, right neighbour count, left entropy, right entropy, MI.
   *
   * <p>A word is retained when it is longer than one character, is not listed in the
   * stop-word file, and TF, left count, right count and MI are each strictly greater than
   * the corresponding entry of {@code threshold}.
   *
   * @param out path of the output file
   * @param stopList path of the stop-word file, one entry per line
   * @param threshold four integer strings: TF, left count, right count, MI
   * @throws NumberFormatException if a threshold entry is not an integer
   * @throws RuntimeException wrapping the {@link IOException} if reading or writing fails
   */
  private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold) {
    // Parse once, before any file is touched, instead of once per map entry.
    int minTF = Integer.parseInt(threshold[0]);
    int minLeft = Integer.parseInt(threshold[1]);
    int minRight = Integer.parseInt(threshold[2]);
    int minMI = Integer.parseInt(threshold[3]);

    try {
      Set<String> stopWords = readStopWords(stopList);

      BufferedWriter bw = new BufferedWriter(new FileWriter(out));
      try {
        for (Map.Entry<String, TFNeighbor> entry : wordTFNeighbor.entrySet()) {
          String word = entry.getKey();
          if (word.length() <= 1 || stopWords.contains(word)) {
            continue;
          }
          TFNeighbor tfNeighbor = entry.getValue();
          int tf = tfNeighbor.getTF();
          int leftNeighborNumber = tfNeighbor.getLeftNeighborNumber();
          int rightNeighborNumber = tfNeighbor.getRightNeighborNumber();
          double mi = countMI(word);

          if (tf > minTF && leftNeighborNumber > minLeft
              && rightNeighborNumber > minRight && mi > minMI) {
            StringBuilder sb = new StringBuilder();
            sb.append(word);
            sb.append(",").append(tf);
            sb.append(",").append(leftNeighborNumber);
            sb.append(",").append(rightNeighborNumber);
            sb.append(",").append(tfNeighbor.getLeftNeighborEntropy());
            sb.append(",").append(tfNeighbor.getRightNeighborEntropy());
            sb.append(",").append(mi).append("\n");
            bw.write(sb.toString());
          }
        }
      } finally {
        bw.close();
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    System.out.println("Info: [Nagao Algorithm Step 4]: having saved to file");
  }

  /**
   * Step 1: reads every input file line by line into the P tables.
   *
   * @throws RuntimeException wrapping the {@link IOException} if a file cannot be read
   */
  private void loadInputs(String[] inputs) {
    for (String in : inputs) {
      try {
        BufferedReader br = new BufferedReader(new FileReader(in));
        try {
          String line;
          while ((line = br.readLine()) != null) {
            addToPTable(line);
          }
        } finally {
          br.close();
        }
      } catch (IOException e) {
        // Keep the cause so the failing file can be diagnosed.
        throw new RuntimeException(e);
      }
    }
    System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable");
  }

  /**
   * Runs the algorithm with the default maximum substring length (5) and the default
   * output filter ("20,3,3,5").
   *
   * @param inputs paths of the input text files
   * @param out path of the output file
   * @param stopList path of the stop-word file
   */
  public static void applyNagao(String[] inputs, String out, String stopList) {
    applyNagao(inputs, out, stopList, DEFAULT_N, DEFAULT_FILTER);
  }

  /**
   * Runs the algorithm with an explicit maximum substring length and output filter.
   *
   * <p>If {@code filter} does not consist of exactly four comma-separated fields an error is
   * printed and nothing is processed.
   *
   * @param inputs paths of the input text files
   * @param out path of the output file
   * @param stopList path of the stop-word file
   * @param n maximum length of counted substrings
   * @param filter four comma-separated integers: TF, left count, right count, MI thresholds
   */
  public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter) {
    NagaoAlgorithm nagao = new NagaoAlgorithm();
    nagao.setN(n);
    String[] threshold = filter.split(",");
    if (threshold.length != 4) {
      System.out.println("ERROR: filter must have 4 numbers, seperated with ',' ");
      return;
    }
    // step 1: add phrases to PTable
    nagao.loadInputs(inputs);
    // step 2: sort PTable and count LTable
    nagao.countLTable();
    // step 3: count TF and Neighbor
    nagao.countTFNeighbor();
    // step 4: save TF NeighborInfo and MI
    nagao.saveTFNeighborInfoMI(out, stopList, threshold);
  }

  private void setN(int n) {
    N = n;
  }

  /** Sample run against fixed local paths. */
  public static void main(String[] args) {
    String[] ins = {"E://test//ganfen.txt"};
    applyNagao(ins, "E://test//out.txt", "E://test//stoplist.txt");
  }
}

2. TFNeighbor.java

package com.algo.word;

import java.util.HashMap;
import java.util.Map;

/**
 * Statistics collected for one candidate word: its term frequency and how often each
 * character was recorded as its left or right neighbour.
 */
public class TFNeighbor {

  /** Term frequency of the word. */
  private int tf;
  /** Occurrence count per character recorded as left neighbour. */
  private final Map<Character, Integer> leftNeighbor;
  /** Occurrence count per character recorded as right neighbour. */
  private final Map<Character, Integer> rightNeighbor;

  TFNeighbor() {
    leftNeighbor = new HashMap<Character, Integer>();
    rightNeighbor = new HashMap<Character, Integer>();
  }

  /** Adds one to the count stored for {@code key}, starting from 1 when it is absent. */
  private static void increment(Map<Character, Integer> counts, char key) {
    Integer number = counts.get(key);
    counts.put(key, number == null ? 1 : 1 + number);
  }

  /**
   * Returns the natural-log entropy of the distribution described by {@code counts},
   * computed as log(sum) - (sum of n*log(n)) / sum; returns 0 for an empty map.
   */
  private static double entropy(Map<Character, Integer> counts) {
    double weightedLogs = 0;
    int sum = 0;
    for (int number : counts.values()) {
      weightedLogs += number * Math.log(number);
      sum += number;
    }
    if (sum == 0) {
      return 0;
    }
    return Math.log(sum) - weightedLogs / sum;
  }

  /** Records one more occurrence of {@code word} as a left neighbour. */
  public void addToLeftNeighbor(char word) {
    increment(leftNeighbor, word);
  }

  /** Records one more occurrence of {@code word} as a right neighbour. */
  public void addToRightNeighbor(char word) {
    increment(rightNeighbor, word);
  }

  /** Increases the term frequency by one. */
  public void incrementTF() {
    tf++;
  }

  /** Returns the number of distinct left-neighbour characters. */
  public int getLeftNeighborNumber() {
    return leftNeighbor.size();
  }

  /** Returns the number of distinct right-neighbour characters. */
  public int getRightNeighborNumber() {
    return rightNeighbor.size();
  }

  /** Returns the entropy of the left-neighbour distribution (0 when none recorded). */
  public double getLeftNeighborEntropy() {
    return entropy(leftNeighbor);
  }

  /** Returns the entropy of the right-neighbour distribution (0 when none recorded). */
  public double getRightNeighborEntropy() {
    return entropy(rightNeighbor);
  }

  /** Returns the term frequency. */
  public int getTF() {
    return tf;
  }
}

3. Main.java

package com.algo.word; public class Main {   public static void main(String[] args) {         //if 3 arguments, first argument is input files splitting with ','    //second argument is output file    //output 7 columns split with ',' , like below:    //word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information    //third argument is stop words list    if(args.length == 3)      NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2]);         //if 4 arguments, forth argument is the NGram parameter N    //5th argument is threshold of output words, default is "20,3,3,5"    //output TF > 20 && (left | right) neighbor number > 3 && MI > 5    else if(args.length == 5)      NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2], Integer.parseInt(args[3]), args[4]);            } }

以上所述就是本文的全部內容了,希望大家能夠喜歡。

發表評論 共有條評論
用戶名: 密碼:
驗證碼: 匿名發表
亚洲香蕉成人av网站在线观看_欧美精品成人91久久久久久久_久久久久久久久久久亚洲_热久久视久久精品18亚洲精品_国产精自产拍久久久久久_亚洲色图国产精品_91精品国产网站_中文字幕欧美日韩精品_国产精品久久久久久亚洲调教_国产精品久久一区_性夜试看影院91社区_97在线观看视频国产_68精品久久久久久欧美_欧美精品在线观看_国产精品一区二区久久精品_欧美老女人bb
亚洲91精品在线观看| 最近中文字幕mv在线一区二区三区四区| 国产一区二区日韩| 91av视频在线观看| 欧美激情亚洲精品| 欧美日韩国产中文字幕| 欧美福利在线观看| 国产在线999| 日韩免费av一区二区| 久久精品国产综合| 国产日韩中文字幕在线| 国产精品自产拍在线观看中文| 亚洲第一男人天堂| 欧美激情在线狂野欧美精品| 欧美富婆性猛交| 日韩视频在线观看免费| 日韩欧美国产中文字幕| 午夜精品久久久久久99热软件| 欧美国产日韩一区二区在线观看| 久久精品国产清自在天天线| 亚洲精品欧美日韩专区| 九九热视频这里只有精品| 欧美极品美女视频网站在线观看免费| 亚洲a级在线观看| www.午夜精品| 欧美区二区三区| 国产视频精品xxxx| 欧美成人精品影院| 国产一区二区香蕉| 欧美诱惑福利视频| 精品久久久精品| 日韩欧中文字幕| 国产日韩欧美在线看| 国产精品网红福利| 亚洲free性xxxx护士hd| 久久久国产精品一区| 久久免费国产精品1| 欧美大肥婆大肥bbbbb| 色噜噜亚洲精品中文字幕| 欧美夜福利tv在线| 亚洲精品720p| www.亚洲一区| 精品国产一区二区三区久久久| 亚洲免费成人av电影| 国产91精品久久久| 久久久91精品国产一区不卡| 日韩在线视频观看| 亚洲欧美精品在线| 欧美wwwxxxx| 国产成人综合精品在线| 亚洲已满18点击进入在线看片| 7777kkkk成人观看| 成人国产精品日本在线| 久久天天躁狠狠躁夜夜躁| 自拍亚洲一区欧美另类| 尤物yw午夜国产精品视频明星| 琪琪第一精品导航| 亚洲黄页视频免费观看| 国产精品视频公开费视频| 亚洲天堂视频在线观看| 久久久99久久精品女同性| 国产自产女人91一区在线观看| 中文字幕精品久久| 欧美性xxxx极品hd满灌| 国产欧美久久一区二区| 国产成人免费av| 国产精品久久久久久久久久小说| 国产午夜精品视频| 中文字幕精品av| 欧美网站在线观看| 欧美成人剧情片在线观看| 国产999精品久久久影片官网| 91亚洲永久免费精品| 精品夜色国产国偷在线| 欧美丰满少妇xxxx| 亚洲欧美精品一区二区| 国产极品jizzhd欧美| 国产精品video| 亚洲男女性事视频| 国产精品福利观看| 欧美日韩国产999| 日韩av在线免费看| 成人在线国产精品| 精品久久久久久久久国产字幕| 国内伊人久久久久久网站视频| 亚洲图片在区色| 欧美在线中文字幕| 亚洲一区二区三区四区在线播放| 日韩精品有码在线观看| 精品日韩视频在线观看| 狠狠操狠狠色综合网| 国产丝袜精品视频| 亚洲女人天堂色在线7777| 久久综合久久美利坚合众国| 久久久久九九九九| 一区二区在线视频| 精品久久久视频| 国产精品对白刺激| 欧美猛交ⅹxxx乱大交视频| 欧美性一区二区三区| 亚洲激情久久久| 国产日韩欧美黄色| 精品无人区太爽高潮在线播放| 国产精品视频久| 亚洲人成电影在线播放| 日韩在线资源网| 日韩亚洲欧美成人| 在线播放国产精品| 国产91精品网站| 午夜欧美大片免费观看| 4438全国亚洲精品在线观看视频| 国产99久久精品一区二区 夜夜躁日日躁| 美乳少妇欧美精品| 亚洲第一福利网站| 国产精品99蜜臀久久不卡二区| 精品福利视频导航| 久久综合久久八八| 亚洲精品按摩视频| 2021国产精品视频| 日韩中文字幕在线观看| 久久精品小视频| 精品调教chinesegay| 最好看的2019年中文视频| 久久免费精品日本久久中文字幕| 欧美日韩国产专区| 亚洲人高潮女人毛茸茸| 亚洲丁香婷深爱综合| 亚洲精品永久免费精品| 国产精品久久一区| 久久久精品国产| 国产在线98福利播放视频| 日韩一区二区三区在线播放| 欧美大片欧美激情性色a∨久久| 午夜精品福利电影| 精品久久久一区二区| 日韩精品亚洲元码| 国内精品久久久久久中文字幕| 欧美激情xxxx| 欧美激情二区三区| 亚洲视频第一页| 国产精品黄页免费高清在线观看| 久久手机精品视频| 日韩在线观看免费高清| 亚洲精品在线视频| 欧美成人四级hd版| 亚洲韩国青草视频| 久久精品99久久久久久久久| 日韩精品视频在线观看网址| 精品视频在线播放| 久久男人的天堂| 欧美黑人视频一区| 狠狠久久亚洲欧美专区| 成人免费在线视频网站| 中文国产成人精品| 亚洲欧美一区二区精品久久久| 成人免费视频在线观看超级碰| 中文字幕亚洲综合| 亚洲毛片一区二区| 不卡在线观看电视剧完整版| 国内精品国产三级国产在线专| 欧美剧在线观看| 97视频国产在线| 日韩欧美在线视频观看| 亚洲va欧美va国产综合剧情|