Introduction
Using open-source Java libraries, this article builds a small search engine that crawls the content of a website, follows the links found on each page to a configurable depth to collect all related page addresses and titles, and lets users search the collected URLs by keyword.
Specific functions
(1) The user can specify a URL, and the program fetches the content of the corresponding web page.
(2) The program parses the page content and extracts all URL link addresses it contains.
(3) The user can set the crawl depth: starting from the page at the initial URL, the URLs found on that page are crawled in turn, then the URLs found on those pages, and so on. The greater the depth, the more pages are crawled. (A minimal Jsoup sketch of these first three steps follows this list.)
(4) The crawled URLs are saved and indexed. The indexed content is the URL address itself and the title of the corresponding web page.
(5) The user can search the index by keyword to find the URLs whose titles contain that keyword.
(6) Both index building and index searching recognize Chinese keywords and perform word segmentation on them.
(7) The user can specify the path where the index is saved, the initial URL, the crawl depth, the keywords to search for, and the maximum number of matches.
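As a rough illustration of items (1)-(3) before the full Spider class below, the following minimal sketch (the class name FetchDemo and the start address are placeholders, not part of the original project) uses Jsoup to fetch one page, read its title, and list the links it contains:

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class FetchDemo {
    public static void main(String[] args) throws IOException {
        // Fetch the page at a placeholder address with a 3-second timeout
        Document document = Jsoup.connect("http://www.zju.edu.cn").timeout(3000).get();
        // The page title is what the crawler later stores in the index
        System.out.println("Title: " + document.title());
        // Select every <a> tag with an href attribute and print its absolute URL
        for (Element link : document.select("a[href]")) {
            System.out.println(link.attr("abs:href"));
        }
    }
}

The crawler repeats exactly this fetch-and-extract step for every URL it discovers, down to the chosen depth.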
Open Source Framework
The project relies on three open-source libraries, visible in the imports of the code below: Jsoup for fetching and parsing web pages, Apache Lucene 4.10.0 for building and searching the index, and IK Analyzer for Chinese word segmentation. A short segmentation example follows.
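To illustrate the word segmentation mentioned in item (6), here is a minimal sketch (an illustration only, not part of the original project; the sample string is arbitrary) that prints the tokens IK Analyzer produces, using the standard Lucene 4.x TokenStream API:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class SegmentDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new IKAnalyzer();
        // Analyze a sample mixed Chinese/English string; the same analyzer
        // is used both when the index is built and when it is searched
        TokenStream stream = analyzer.tokenStream("text", new StringReader("浙江大学计算机学院 homepage"));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Each token is one segmented word
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
        analyzer.close();
    }
}

Because the same IKAnalyzer instance type is plugged into both the IndexWriter and the QueryParser in the code below, a Chinese query is segmented the same way the indexed titles were.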
Source code
Crawler part: Spider.java
package webCrawler.Spider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Scanner;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import webCrawler.Index.BuildIndex;

/**
 * @author lannooo
 */
public class Spider {
    ArrayList<String> URLs;
    private String startURL;
    private int digLevel;

    /**
     * @param startURL the crawler's start URL
     * @param digLevel the crawl depth
     */
    public Spider(String startURL, int digLevel){
        this.startURL = startURL;
        this.digLevel = digLevel;
        this.URLs = new ArrayList<>();
    }

    /**
     * @param level the remaining crawl depth
     * @param arrayList the set of URLs to crawl in this round
     * @return the new set of URLs found on the pages of the given URL set
     * @throws IOException
     */
    public ArrayList<String> getLevelURLs(int level, ArrayList<String> arrayList) throws IOException{
        ArrayList<String> total = null;
        if(level>0){
            total = new ArrayList<>();
            for(String url: arrayList){
                /* For each URL in arrayList, parse its page content and collect every URL found in it */
                for(String each: getBareLinks(url)){
                    total.add(each);
                }
            }
            /* Use a HashSet to remove duplicates from total */
            HashSet<String> hashSet = new HashSet<>(total);
            total = new ArrayList<>(hashSet);
        }
        return total;
    }

    /**
     * Starting from startURL, crawl all related URLs.
     * @throws IOException
     */
    public void getAll() throws IOException{
        ArrayList<String> newURLs;
        ArrayList<String> currentURLs = new ArrayList<>();
        /* Add startURL to currentURLs and start crawling from there */
        currentURLs.add(startURL);
        for(int i=digLevel; i>0; i--){
            /*
             * For each level: get the URLs reachable from the current set,
             * add the current set to the total URL list,
             * then use the newly found URLs as the set for the next round.
             */
            System.out.println("Dig into level: " + (digLevel-i+1));
            newURLs = getLevelURLs(i, currentURLs);
            for(String each: currentURLs){
                URLs.add(each);
            }
            currentURLs = newURLs;
        }
        for(String each: currentURLs){
            URLs.add(each);
        }
        HashSet<String> hashSet = new HashSet<>(URLs);
        URLs = new ArrayList<>(hashSet);
    }

    /**
     * @param path the path where the index is saved
     * @throws IOException
     */
    public void storeURLsAndInfo(String path) throws IOException{
        BuildIndex build = new BuildIndex(path);
        /* Fetch the actual page title for every crawled URL */
        for(String each: URLs){
            String text = getLinkText(each);
            if(text!=null){
                build.addField("url", each);
                build.addField("text", text);
                /* Push this entry into the index */
                build.pushIndex();
            }
        }
        build.close();
    }

    /**
     * @param url the URL whose page title is needed
     * @return the title text, or null on failure
     * @throws IOException
     */
    public String getLinkText(String url) throws IOException{
        Document document = null;
        try {
            /* Connect with Jsoup, with a 3-second timeout */
            document = Jsoup.connect(url).timeout(3000).get();
        } catch (Exception e) {
            System.out.println("[TIMEOUT]Get title of url:"+url);
            return null;
        }
        String title = document.title();
        return title;
    }

    /**
     * @param url the URL whose content is parsed
     * @return the list of all URLs found in that page
     * @throws IOException
     */
    public ArrayList<String> getBareLinks(String url) throws IOException{
        ArrayList<String> linksList = new ArrayList<>();
        Document document;
        try {
            document = Jsoup.connect(url).timeout(2000).get();
        } catch (Exception e) {
            return linksList;
        }
        /* Get all <a> tags with an href attribute inside the <body> tag */
        Elements links = document.select("body").select("a[href]");
        for(Element link: links){
            /* Extract the absolute URL from each parsed <a> tag and strip any anchor fragment */
            String href = link.attr("abs:href").replaceAll("#.*$", "");
            /* Only keep URLs containing zju.edu.cn, and remove a trailing '/' */
            if(href.contains("zju.edu.cn")){
                if (href.endsWith("/")){
                    href = href.substring(0, href.length()-1);
                }
                linksList.add(href);
            }
        }
        HashSet<String> hashSet = new HashSet<>(linksList);
        ArrayList<String> arrayList = new ArrayList<>(hashSet);
        return arrayList;
    }

    public static void main(String[] args) {
        Scanner in = new Scanner(System.in);
        System.out.println("Enter url:");
        String url = in.nextLine().trim();
        while(!url.startsWith("http://")){
            System.out.println("http:// is needed!");
            System.out.println("Enter url:");
            url = in.nextLine().trim();
        }
        System.out.println("Enter depth to dig more urls[<=3 recommended]:");
        int depth = in.nextInt();
        in.nextLine();  /* consume the rest of the line so the next nextLine() reads the path */
        Spider spider = new Spider(url, depth);
        System.out.println("Enter path you want to save[default=d:/index-spider]:");
        String path = in.nextLine().trim();
        if(path.length()==0){
            path = "d:/index-spider";
        }
        try {
            System.out.println("Start fetching...");
            spider.getAll();
            System.out.println("URLs fetched successfully!");
            spider.storeURLsAndInfo(path);
            System.out.println("Stored successfully!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Index part: BuildIndex.java
package webCrawler.Index;

import java.io.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * @author lannooo
 */
public class BuildIndex {
    private File file;
    private Directory directory;
    private IndexWriter indexWriter;
    private IndexWriterConfig config;
    private Analyzer analyzer;
    private Document document;

    /**
     * @param path the path where the index is stored
     */
    public BuildIndex(String path) {
        try {
            file = new File(path);
            directory = FSDirectory.open(file);
            document = new Document();
            analyzer = new IKAnalyzer();  /* Chinese word segmentation analyzer */
            config = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer);
            indexWriter = new IndexWriter(directory, config);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * @param fieldName the name of the field added to the document
     * @param fieldText the content of the field
     */
    public void addField(String fieldName, String fieldText){
        try{
            Field field = new TextField(fieldName, fieldText, Field.Store.YES);
            document.add(field);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Add the current document to the index.
     */
    public void pushIndex(){
        try {
            indexWriter.addDocument(document);
            document = new Document();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Add a complete document (url and text) and push it to the index.
     * @param url the URL to index
     * @param text the text (page title) corresponding to the URL
     */
    public void addOneIndex(String url, String text){
        this.addField("url", url);
        this.addField("text", text);
        this.pushIndex();
    }

    /**
     * Close the index writer.
     */
    public void close(){
        try {
            indexWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
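BuildIndex can also be used on its own, without the crawler, to index arbitrary URL/title pairs through addOneIndex. A small sketch with made-up entries and a made-up index path (for illustration only):

package webCrawler.Index;

public class BuildIndexDemo {
    public static void main(String[] args) {
        // Made-up index path and entries, for illustration only
        BuildIndex build = new BuildIndex("d:/index-demo");
        build.addOneIndex("http://www.zju.edu.cn", "浙江大学");
        build.addOneIndex("http://www.cs.zju.edu.cn", "College of Computer Science");
        build.close();  // always close the writer so the index is flushed to disk
    }
}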
Search part: SearchIndex.java
package webCrawler.Index;

import java.io.File;
import java.util.Scanner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * @author lannooo
 */
public class SearchIndex {
    private IndexSearcher indexSearcher;
    private Analyzer analyzer;
    private QueryParser parser;
    private Query query;
    private TopDocs hits;
    private DirectoryReader reader;

    /**
     * @param path the path of the index to search
     */
    public SearchIndex(String path){
        try {
            reader = DirectoryReader.open(FSDirectory.open(new File(path)));
            indexSearcher = new IndexSearcher(reader);
            analyzer = new IKAnalyzer();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * @param fieldName the field to search in
     * @param text the search text
     * @param matchNumber the maximum number of matches to return
     * @return the total number of hits found, or -1 on error
     */
    public int search(String fieldName, String text, int matchNumber){
        try {
            parser = new QueryParser(fieldName, analyzer);
            query = parser.parse(text);
            hits = indexSearcher.search(query, matchNumber);
            return hits.totalHits;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return -1;
    }

    /**
     * Print all matches.
     */
    public void printHits(){
        try{
            System.out.println("Total hits number:"+hits.totalHits);
            for(ScoreDoc doc: hits.scoreDocs){
                Document document = indexSearcher.doc(doc.doc);
                System.out.println(document.get("url"));
                System.out.println(document.get("text"));
            }
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        /* Read the index path, the maximum hit number, and then the keywords */
        Scanner in = new Scanner(System.in);
        System.out.println("Enter path of the index:");
        String path = in.nextLine().trim();
        while(path.length()==0){
            System.out.println("Enter path of the index:");
            path = in.nextLine().trim();
        }
        System.out.println("Enter max hit number:");
        int max = in.nextInt();
        while(max<0){
            System.out.println("Enter max hit number:");
            max = in.nextInt();
        }
        in.nextLine();  /* consume the rest of the line before reading keywords */
        System.out.print("Search>>> ");
        String text = in.nextLine().trim();
        /* Loop over the user's keywords; 'q' exits, and empty input just re-prompts */
        while(!text.equals("q")){
            if(text.length()>0){
                SearchIndex search = new SearchIndex(path);
                int hits = search.search("text", text, max);
                if(hits!=-1){
                    search.printHits();
                }
            }
            System.out.print("Search>>> ");
            text = in.nextLine().trim();
        }
    }
}
UI part: UI.java (for convenience this is just a command-line interface; you can write a GUI according to your needs)
package webCrawler.UI;

import java.util.Scanner;
import webCrawler.Index.SearchIndex;

/**
 * @author lannooo
 */
public class UI {
    public static void main(String[] args) {
        /* Read keywords from the user */
        Scanner in = new Scanner(System.in);
        System.out.print("Search>>> ");
        String text = in.nextLine().trim();
        /* Loop until the user enters 'q' or an empty line */
        while(!text.equals("q") && text.length()>0){
            SearchIndex search = new SearchIndex("d:/index-spider2");
            int hits = search.search("text", text, 20);
            if(hits!=-1){
                search.printHits();
            }
            System.out.print("Search>>> ");
            text = in.nextLine().trim();
        }
    }
}
That is all the content of this article. I hope it is helpful to everyone's learning, and I hope everyone will continue to support Wulin.com.