I have always wanted to practice multi-threaded data scraping in Java.
One day I discovered that the official website of Ringtone Duoduo (http://www.shoujiduoduo.com/main/) hosts a large amount of data.
Watching the AJAX requests their front end makes shows that ringtone data is fetched from:
http://www.shoujiduoduo.com/ringweb/ringweb.php?type=getlist&listid={Category ID}&page={Pagination Page Number}
It is easy to see that by changing listid and page you can fetch the ringtone JSON data from the server. Parsing that JSON shows that
every response carries an entry such as {"hasmore":1,"curpage":1}; the value of hasmore tells us whether to crawl the next page.
However, the JSON returned by the link above does not include the ringtone's download address.
It is easy to find, though: just click "Download" on the page.
The request below returns the download address of a ringtone:
http://www.shoujiduoduo.com/ringweb/ringweb.php?type=geturl&act=down&rid={ringtone ID}
Their data is therefore very easy to steal. So I got started...
The source code has been posted on GitHub. If you are interested, please check it out.
github: https://github.com/yongbo000/DuoduoAudioRobot
Here is the code:
package me.yongbo.DuoduoRingRobot;import java.io.BufferedReader;import java.io.File;import java.io.FileWriter;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.net.URL;import java.net.URLConnection;import java.util.Iterator;import java.util.regex.Matcher;import java.util.regex.Pattern;import com.google.gson.Gson;import com.google.gson.JsonArray;import com.google.gson.JsonElement;import com.google.gson.JsonParser;/* * @author yongbo_ * @created 2013/4/16 * * */public class DuoduoRingRobotClient implements Runnable {public static String GET_RINGINFO_URL = "http://www.shoujiduoduo.com/ringweb/ringweb.php?type=getlist&listid=%1$d&page=%2$d";public static String GET_DOWN_URL = "http://www.shoujiduoduo.com/ringweb/ringweb.php?type=geturl&act=down&rid=%1$d";public static String ERROR_MSG = "An error occurred with listId of %1$d and it has been stopped automatically. The current page is %2$d";public static String STATUS_MSG = "Start crawling data, current listId: %1$d, current page: %2$d";public static String FILE_DIR = "E:/RingData/";public static String FILE_NAME = "listId=%1$d.txt";private boolean errorFlag = false;private int listId;private int page;private int endPage = -1;private int hasMore = 1;private DbHelper dbHelper;/** * Constructor* @param listId Menu ID * @param page Start page number* @param endPage End page number* */public DuoduoRingRobotClient(int listId, int beginPage, int endPage) {this.listId = listId;this.page = beginPage;this.endPage = endPage;this.dbHelper = new DbHelper();}/** * Constructor* @param listId Menu ID * @param page Start page number* */public DuoduoRingRobotClient(int listId, int page) {this(listId, page, -1);}/** * Get ringtone* */public void getRings() {String url = String.format(GET_RINGINFO_URL, listId, page);String responseStr = httpGet(url);hasMore = getHasmore(responseStr);page = 
getNextPage(responseStr);ringParse(responseStr.replaceAll("//{/"hasmore/":[0-9]*,/"curpage/":[0-9]*//},", "").replaceAll(",]", "]"));}/** * Initiate http request* @param webUrl Request connection address* */public String httpGet(String webUrl){URL url;URLConnection conn;StringBuilder sb = new StringBuilder();String resultStr = "";try {url = new URL(webUrl);conn = url.openConnection();conn.connect();InputStream is = conn.getInputStream();InputStreamReader isr = new InputStreamReader(is);BufferedReader bufReader = new BufferedReader(isr);String lineText; while ((lineText = bufReader.readLine()) != null) {sb.append(lineText);}resultStr = sb.toString();} catch (Exception e) {errorFlag = true;//Write an error to txtwriteToFile(String.format(ERROR_MSG, listId, page));}return resultStr;}/** * Convert the json string into a Ring object and save it in txt* @param json Json string* */public void ringParse(String json) {Ring ring = null;JsonElement element = new JsonParser().parse(json);JsonArray array = element.getAsJsonArray();// traverse the array Iterator<JsonElement> it = array.iterator();Gson gson = new Gson();while (it.hasNext() && !errorFlag) {JsonElement e = it.next();// Convert JsonElement to JavaBean object ring = gson.fromJson(e, Ring.class);ring.setDownUrl(getRingDownUrl(ring.getId()));if(isAvailableRing(ring)) {System.out.println(ring.toString());//You can choose to write to the database or to text//writeToFile(ring.toString());writeToDatabase(ring);}}}/** * Write to txt * @param data String* */public void writeToFile(String data) {String path = FILE_DIR + String.format(FILE_NAME, listId);File dir = new File(FILE_DIR);File file = new File(path);FileWriter fw = null;if(!dir.exists()){dir.mkdirs();}try {if(!file.exists()){file.createNewFile();}fw = new FileWriter(file, true);fw.write(data);fw.write("/r/n");fw.flush();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();} finally {try {if(fw != null){fw.close();}} catch (IOException e) {// 
TODO Auto-generated catch blocke.printStackTrace();}}}/** * Write to database* @param ring An instance of Ring* */public void writeToDatabase(Ring ring) {dbHelper.execute("addRing", ring);}@Overridepublic void run() {while(hasMore == 1 && !errorFlag){if(endPage != -1){if(page > endPage) { break; }}System.out.println(String.format(STATUS_MSG, listId, page));getRings();System.out.println(String.format("The data written on this page is completed"));}System.out.println("ending...");}private int getHasmore(String resultStr){Pattern p = Pattern.compile("/"hasmore/":([0-9]*),/"curpage/":([0-9]*)"); Matcher match = p.matcher(resultStr); if (match.find()) { return Integer.parseInt(match.group(1)); } return 0;}private int getNextPage(String resultStr){Pattern p = Pattern.compile("/"hasmore/":([0-9]*),/"curpage/":([0-9]*)");Matcher match = p.matcher(resultStr);if (match.find()) {return Integer.parseInt(match.group(2));}return 0;}/** * Determine whether the current Ring meets the condition. When the Ring name is greater than 50 characters or the duration is a decimal, it does not meet the conditions and will be removed. * @param ring Current Ring object instance* */private boolean isAvailableRing(Ring ring){Pattern p = Pattern.compile("^[1-9][0-9]*$");Matcher match = p.matcher(ring.getDuration());if(!match.find()){return false;}if(ring.getName().length() > 50 || ring.getArtist().length() > 50 || ring.getDownUrl().length() == 0){return false;}return true;}/** * Get the download address of the ringtone* @param rid ringtone id * */public String getRingDownUrl(String rid){String url = String.format(GET_DOWN_URL, rid);String responseStr = httpGet(url);return responseStr;}}