main
package com.liang;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
/**
 * Downloads a fixed range of chapter pages with HtmlUnit (so that page JavaScript is executed),
 * extracts the chapter heading and body with jsoup, and appends each chapter to a text file via
 * {@link NovelWrite}.
 */
public class GetJSDocument1 {

    /** Maximum number of download attempts for one chapter page before it is skipped. */
    private static final int MAX_ATTEMPTS_PER_PAGE = 3;

    /**
     * Command-line entry point: runs {@link #getDoc()} and prints any failure to stderr.
     *
     * @param args ignored
     * @throws IOException never thrown in practice; kept for signature compatibility
     * @throws InterruptedException never thrown in practice; kept for signature compatibility
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        GetJSDocument1 getJSDocument = new GetJSDocument1();
        try {
            getJSDocument.getDoc();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the JVM/caller can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches chapter pages 7666 (inclusive) to 9132 (exclusive) and appends each one to the
     * output file. A page that keeps failing is skipped after {@link #MAX_ATTEMPTS_PER_PAGE}
     * attempts instead of being retried forever.
     *
     * @throws InterruptedException if the thread is interrupted while waiting for a page
     * @throws IOException declared for signature compatibility; per-page I/O errors are handled
     *     by the retry logic
     */
    @Test
    public void getDoc() throws InterruptedException, IOException {
        // Silence the very chatty HtmlUnit / HttpClient loggers.
        java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
        java.util.logging.Logger.getLogger("org.apache.http.client").setLevel(Level.OFF);

        // The WebClient simulates a browser; try-with-resources releases it when done.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(true); // run page JavaScript
            webClient.getOptions().setCssEnabled(false); // CSS is not needed for text extraction
            webClient.getOptions().setThrowExceptionOnScriptError(false); // tolerate script errors
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            // Let AJAX calls be resynchronized so their results are present in the page.
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
            webClient.getOptions().setTimeout(10 * 1000); // timeout in milliseconds

            for (int i = 7666; i < 9132; i++) {
                String url = "http://www.xiaonongminxs.com/chongfantian/" + i + ".html";
                System.out.println(url);
                fetchWithRetry(url, webClient);
            }
        }
    }

    /**
     * Calls {@link #extracted(String, WebClient)} until it succeeds or the attempt budget is
     * used up; failures are printed and the page is skipped once the budget is exhausted.
     *
     * @throws InterruptedException if the thread is interrupted; propagated so the run stops
     */
    private void fetchWithRetry(String url, WebClient webClient) throws InterruptedException {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS_PER_PAGE; attempt++) {
            try {
                extracted(url, webClient);
                return;
            } catch (IOException | RuntimeException e) {
                e.printStackTrace();
            }
        }
        System.err.println(
                "Giving up on " + url + " after " + MAX_ATTEMPTS_PER_PAGE + " attempts");
    }

    /**
     * Loads one chapter page, extracts its heading and text, prints the chapter and appends it
     * to the output file.
     *
     * @param url address of the chapter page
     * @param webClient configured HtmlUnit client used to load the page
     * @throws IOException if the page cannot be loaded or contains no {@code .panel-body} markup
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    private void extracted(String url, WebClient webClient) throws IOException, InterruptedException {
        HtmlPage htmlPage = webClient.getPage(url);
        Thread.sleep(1000);
        webClient.waitForBackgroundJavaScript(3 * 1000); // wait up to 3 s for background scripts

        // Re-parse the rendered page with jsoup for CSS-selector based extraction.
        Document document = Jsoup.parse(htmlPage.asXml());
        String panelHtml = document.select(".panel-body").html();
        if (panelHtml.trim().isEmpty()) {
            throw new IOException("No .panel-body content found at " + url);
        }
        Elements titleBlock = document.select(".m-title");
        String heading = titleBlock.select("h1").text();

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("\r\n");
        stringBuilder.append(heading);
        stringBuilder.append("\r\n");
        stringBuilder.append(extractChapterText(panelHtml));

        System.out.println(stringBuilder.toString());
        NovelWrite novelWrite = new NovelWrite();
        novelWrite.writenovel(stringBuilder);
    }

    /**
     * Converts the inner HTML of the chapter panel to plain text: {@code <br>} becomes CRLF,
     * every space character is removed, and only what follows the last closing {@code </div>}
     * tag is kept. If there is no closing div, the whole converted text is returned.
     */
    private static String extractChapterText(String panelHtml) {
        // NOTE(review): the space removal may have been meant to strip &nbsp; entities - confirm.
        String text = panelHtml.replace("<br>", "\r\n").replace(" ", "");
        int lastClose = text.lastIndexOf("</div");
        if (lastClose < 0) {
            return text;
        }
        // Skip the complete closing tag, including its '>', so no stray '>' leads the chapter.
        int tagEnd = text.indexOf('>', lastClose);
        if (tagEnd < 0) {
            return text.substring(lastClose + "</div".length());
        }
        return text.substring(tagEnd + 1);
    }
}
write
package com.liang;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
/**
 * Appends text (for example one scraped chapter) to a text file on disk.
 */
public class NovelWrite {

    /** Output file used by {@link #writenovel(StringBuilder)}. */
    private static final File DEFAULT_TARGET =
            new File("C:\\Users\\Fangliang\\Desktop\\a\\123.txt");

    /**
     * Appends the given text to the default output file.
     *
     * @param stringBuilder text to append; {@code null} is ignored
     */
    public void writenovel(StringBuilder stringBuilder) {
        writenovel(stringBuilder, DEFAULT_TARGET);
    }

    /**
     * Appends the given text to {@code target}, creating the file if it does not exist yet.
     * This is best-effort: an I/O failure is printed to stderr and not propagated. The text is
     * encoded with the platform default charset.
     *
     * @param stringBuilder text to append; {@code null} is ignored
     * @param target file to append to; {@code null} is ignored
     */
    public void writenovel(StringBuilder stringBuilder, File target) {
        if (stringBuilder == null || target == null) {
            return;
        }
        // try-with-resources closes the writer (and the wrapped FileWriter) even if write fails.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(target, true))) {
            writer.write(stringBuilder.toString());
        } catch (java.io.IOException e) {
            e.printStackTrace();
        }
    }
}