Java應用開源框架實現簡易web搜索引擎

2024-07-13 10:13:35

字體：大中小

來源：轉載

供稿：網友

引言

應用 Java 的開源庫，編寫一個搜索引擎，這個引擎能爬取一個網站的內容，并根據網頁內容進行深度爬取，獲取所有相關的網頁地址和內容；用戶可以通過關鍵詞，搜索所有相關的網址。

具體功能

(1) 用戶可以指定爬取一個url對應的網頁的內容。
(2) 對網頁內容進行解析，并獲取其中所有的url鏈接地址。
(3) 用戶可以設定爬取深度，代表著從初始url對應的頁面開始，可以爬取其中所有的url對應的網頁內的url，以此類推。深度越大，能爬取到的網站越多。
(4) 對爬取到的url內容進行保存、建立索引。建立索引的內容是url地址本身，和url對應的網頁標題。
(5) 用戶可以通過關鍵詞對網址進行搜索，找出有該關鍵詞的url地址。
(6) 建立索引和搜索索引的過程能智能識別中文關鍵詞，能對關鍵詞進行分詞操作。
(7) 用戶可以指定保存索引的地址、初始url、爬取深度、進行搜索的關鍵詞和最大匹配項。

開源框架

Lucene
Jsoup

源碼

爬蟲部分：Spider.java

package webCrawler.Spider;import java.io.IOException;import java.util.ArrayList;import java.util.HashSet;import java.util.Scanner;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import webCrawler.Index.BuildIndex;/** * @author lannooo */public class Spider {  ArrayList<String> URLs;  private String startURL;  private int digLevel;  /**   * @param startURL 爬蟲的起始URL   * @param digLevel 爬取深度   */  public Spider(String startURL, int digLevel){    this.startURL = startURL;    this.digLevel = digLevel;    this.URLs = new ArrayList<>();  }  /**   * @param level 當前爬取的深度剩余   * @param arrayList 需要進行下一輪爬去的URL集   * @return 從一格url集爬取到的新的URL集   * @throws IOException   */  public ArrayList<String> getLevelURLs(int level, ArrayList<String> arrayList)       throws IOException{    ArrayList<String> total = null;    if(level>0){            total = new ArrayList<>();      for(String url: arrayList){        /*對于每個arrayList中的URL，首先解析其網頁內容，并獲得里面所有URL項*/        for(String each: getBareLinks(url)){          total.add(each);        }      }      /*用HashSet這個容器將total里面重復項刪除*/      HashSet<String> hashSet = new HashSet<>(total);      total = new ArrayList<>(hashSet);    }    return total;  }  /**   * 從startURL開始，爬取所有相關URLs   * @throws IOException   */  public void getAll() throws IOException{    ArrayList<String> newURLs;    ArrayList<String> currentURLs = new ArrayList<>();    /*把startURL加入currentURLs這個列表中，從這個url開始爬*/    currentURLs.add(startURL);    for(int i=digLevel; i>0; i--){      /*       * 對于每一層，都要獲取一次由這個url引申出去的url集       * 然后把當前集的已經爬去過的url加入到總的URL集中       * 最后newURLs作為新的需要進行深度爬取的集進入下一輪循環       */      System.out.println("Dig into level: " + (digLevel-i+1));      newURLs = getLevelURLs(i, currentURLs);      for(String each: currentURLs){        URLs.add(each);      }      currentURLs = newURLs;    }    for(String each:currentURLs){      URLs.add(each);    }    HashSet<String> hashSet = new HashSet<>(URLs);    URLs 
= new ArrayList<>(hashSet);  }  /**   * @param path 保存索引的路徑   * @throws IOException   */  public void storeURLsAndInfo(String path) throws IOException{    BuildIndex build = new BuildIndex(path);    /* 把URLs中的所有url進行實際網頁標題的爬取*/    for(String each:URLs){      String text = getLinkText(each);      if(text!=null){        build.addField("url", each);        build.addField("text", text);        /*將這一個entry加入索引中*/        build.pushIndex();      }    }    build.close();  }  /**   * @param url 需要獲取網頁標題的url   * @return 標題內容   * @throws IOException   */  public String getLinkText(String url) throws IOException{    Document document = null;    try {      /*用Jsoup進行連接，設置超時時間為3秒*/      document = Jsoup.connect(url).timeout(3000).get();    } catch (Exception e) {      System.out.println("[TIMEOUT]Get title of url:"+url);      return null;    }    String    return title;  }  /**   * @param url 進行內容解析的url   * @return 返回該url的網頁內容內的所有urls列表   * @throws IOException   */  public ArrayList<String> getBareLinks(String url) throws IOException{    ArrayList<String> linksList = new ArrayList<>();    Document document;    try {      document = Jsoup.connect(url).timeout(2000).get();    } catch (Exception e) {      return linksList;    }    /*獲取<body>標簽理的所有帶href屬性的<a>標簽*/    Elements links = document.select("body").select("a[href]");    for(Element link: links){      /*從每一個解析得到的<a>標簽中提取url，并去除錨點*/      String href = link.attr("abs:href").replaceAll("#", "");      /*只添加含有zju.edu.cn字符的url，去除末尾的'/'*/      if(href.contains("zju.edu.cn")){        if (href.endsWith("/")){          href = href.substring(0, href.length()-1);        }        linksList.add(href);      }    }    HashSet<String> hashSet = new HashSet<>(linksList);    ArrayList<String> arrayList = new ArrayList<>(hashSet);    return arrayList;  }  public static void main(String[] args) {    Scanner in = new Scanner(System.in);    System.out.println("Enter url:");    String url = in.nextLine().trim();    while(!url.startsWith("http://")){ 
     System.out.println("http:// is needed!");      System.out.println("Enter url:");      url = in.nextLine().trim();    }    System.out.println("Enter depth to dig more urls[<=3 recommended]：");    int depth = in.nextInt();    Spider spider = new Spider(url, depth);    System.out.println("Enter path you want to save[default=d:/index-spider]:");    String path = in.nextLine().trim();    if(path.length()==0){      path = "d:/index-spider";    }    try {      System.out.println("Start fetching...");      spider.getAll();      System.out.println("Urls got success!");      spider.storeURLsAndInfo(path);      System.out.println("Stored success!");    } catch (IOException e) {      e.printStackTrace();    }  } }

建立索引：BuildIndex.java

package webCrawler.Index;import java.io.*;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.TextField;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer;/** * @author lannooo * */public class BuildIndex {  private File file;  private Directory directory;  private IndexWriter indexWriter;  private IndexWriterConfig config;  private Analyzer analyzer;  private Document document;  /**   * @param path 建立索引的路徑   */  public BuildIndex(String path) {    try {      file = new File(path);      directory = FSDirectory.open(file);      document = new Document();      analyzer = new IKAnalyzer();    /*中文分詞工具類*/      config = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer);      indexWriter = new IndexWriter(directory, config);          } catch (Exception e) {      e.printStackTrace();    }  }  /**   * @param fieldName 加入到document中的新的一項的名稱   * @param fieldText 新的一項的內容   */  public void addField(String fieldName, String fieldText){    try{      Field field = new TextField(fieldName, fieldText, Field.Store.YES);      document.add(field);    }catch (Exception e) {      e.printStackTrace();    }  }  /**   * 將document加入到索引中   */  public void pushIndex(){    try {      indexWriter.addDocument(document);      document = new Document();    } catch (Exception e) {      e.printStackTrace();    }  }  /**   * 加入完整的一個document并保存到索引中   * @param url 加入的url地址   * @param text url對應的文本   */  public void addOneIndex(String url, String text){    this.addField("url", url);    this.addField("text", text);    this.pushIndex();  }  /**   * 關閉索引寫入   */  public void close(){    try {      indexWriter.close();    } catch (Exception e) {      e.printStackTrace();    }  }}

搜索索引：SearchIndex.java

package webCrawler.Index;import java.io.File;import java.util.Scanner;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.document.Document;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.queryparser.classic.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.FSDirectory;import org.wltea.analyzer.lucene.IKAnalyzer;/** * @author lannooo * */public class SearchIndex {  private IndexSearcher indexSearcher;  private Analyzer analyzer;  private QueryParser parser;  private Query query;  private TopDocs hits;  private DirectoryReader reader;  /**   * @param path 進行索引搜索的路徑   */  public SearchIndex(String path){    try {      reader = DirectoryReader.open(FSDirectory.open(new File(path)));      indexSearcher = new IndexSearcher(reader);      analyzer = new IKAnalyzer();    } catch (Exception e) {      e.printStackTrace();    }  }  /**   * @param fieldName 搜索的域名稱   * @param text 搜索的內容   * @param matchNumber 最大匹配項數   * @return 搜索到的最大匹配數   */  public int search(String fieldName, String text, int matchNumber){    try {      parser = new QueryParser(fieldName, analyzer);      query = parser.parse(text);      hits = indexSearcher.search(query, matchNumber);      return hits.totalHits;    } catch (Exception e) {      e.printStackTrace();    }    return -1;  }  /**   * 打印所有的匹配項   */  public void printHits(){    try{      System.out.println("Total hits number:"+hits.totalHits);      for(ScoreDoc doc: hits.scoreDocs){        Document document = indexSearcher.doc(doc.doc);        System.out.println(document.get("url"));        System.out.println(document.get("text"));      }      reader.close();    }catch (Exception e) {      e.printStackTrace();    }  }  public static void main(String[] args) {    /*輸入關鍵詞*/    Scanner in = new Scanner(System.in);    System.out.println("Enter path of the 
index:");    String path = in.nextLine().trim();    while(path.length()==0){      System.out.println("Enter path of the index:");      path = in.nextLine().trim();    }    System.out.println("Enter max hit number:");    int max = in.nextInt();    while(max<0){      System.out.println("Enter max hit number:");      max = in.nextInt();    }    in.nextLine();    System.out.print("Search>>> ");    String text = in.nextLine().trim();    /*循環讀入用戶的關鍵詞，如果是q則退出，長度為0也退出*/    while(!text.equals("q")){      if(text.length()>0){        SearchIndex search = new SearchIndex(path);        int hits = search.search("text", text, max);        if(hits!=-1){          search.printHits();        }      }      System.out.print("Search>>> ");      text = in.nextLine().trim();    }  }}

UI界面（這里為了方便只是命令行的形式，可以根據需求寫一個GUI界面）

package webCrawler.UI;

import java.util.Scanner;

import webCrawler.Index.SearchIndex;

/**
 * Minimal command-line front end for {@link SearchIndex}.
 *
 * @author lannooo
 */
public class UI {
  /*
   * NOTE(review): Spider's default output path is d:/index-spider, not
   * d:/index-spider2; confirm which one this front end is meant to read.
   */
  private static final String DEFAULT_INDEX_PATH = "d:/index-spider2";
  private static final int MAX_HITS = 20;

  /**
   * Reads queries from standard input and prints the matching index entries.
   * The loop ends on {@code q}, on an empty line, or at end of input.
   *
   * @param args optional; {@code args[0]} overrides the default index path
   */
  public static void main(String[] args) {
    String indexPath = DEFAULT_INDEX_PATH;
    if (args != null && args.length > 0 && args[0].trim().length() > 0) {
      indexPath = args[0].trim();
    }

    Scanner in = new Scanner(System.in);
    String text = readQuery(in);
    while (text != null && !text.equals("q") && text.length() > 0) {
      SearchIndex search = new SearchIndex(indexPath);
      int hits = search.search("text", text, MAX_HITS);
      if (hits != -1) {
        search.printHits();
      }
      text = readQuery(in);
    }
  }

  /** Prompts for one query; returns {@code null} when the input has ended. */
  private static String readQuery(Scanner in) {
    System.out.print("Search>>> ");
    return in.hasNextLine() ? in.nextLine().trim() : null;
  }
}

以上就是本文的全部內容，希望對大家的學習有所幫助，也希望大家多多支持VeVb武林網。

注：相關教程知識閱讀請移步到JAVA教程頻道。

上一篇：Java實現的微信圖片處理工具類【裁剪，合并，等比例縮放等】

下一篇：Java編程實現軌跡壓縮算法開放窗口實例代碼