Cool
Cool
Published on 2021-06-20 / 42 Visits
0
0

Java 抓取 JS 动态渲染页面：HtmlUnit 入门

pom依赖

 <!-- HtmlUnit: provides the WebClient used below to load the page and run its JavaScript. -->
 <!-- NOTE(review): versions are the ones pinned when this post was written (2021); check for newer releases before reuse. -->
 <dependency>
      <groupId>net.sourceforge.htmlunit</groupId>
      <artifactId>htmlunit</artifactId>
      <version>2.35.0</version>
    </dependency>
    <!-- jsoup: parses the markup returned by HtmlUnit and supports CSS-selector queries. -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.12.1</version>
    </dependency>

代码


    // Fetch a JavaScript-rendered page with HtmlUnit, then query the rendered markup with jsoup.
    // `num` is the picture id and is supplied by the surrounding code.
    String url = "https://pic.netbian.com/tupian/" + num + ".html";
    // Alternative target: String url = "https://blog.cool88.top";

    // Silence the logging produced by HtmlUnit and Apache HttpClient.
    // Alternative via commons-logging:
    // LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log","org.apache.commons.logging.impl.NoOpLog");
    java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
    java.util.logging.Logger.getLogger("org.apache.http.client").setLevel(Level.OFF);

    // The rendered markup is the only thing needed once the browser is closed.
    String pageAsXml;

    // WebClient is AutoCloseable; try-with-resources releases its windows, background
    // JavaScript threads and HTTP connections even when loading the page fails.
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        webClient.getOptions().setJavaScriptEnabled(true); // run page scripts (already the default)
        webClient.getOptions().setCssEnabled(false); // CSS is not needed for scraping
        webClient.getOptions().setThrowExceptionOnScriptError(false); // tolerate broken page scripts
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); // do not throw on 4xx/5xx
        // Re-synchronise AJAX calls so their results are present in the page.
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        webClient.getOptions().setTimeout(5 * 1000); // connect and read timeout, in milliseconds

        HtmlPage htmlPage = webClient.getPage(url);
        Thread.sleep(1000);
        webClient.waitForBackgroundJavaScript(3 * 1000); // wait up to 3 seconds for background JS

        // Serialise the DOM while the client is still open.
        pageAsXml = htmlPage.asXml();
        //System.out.println(pageAsXml);
    }

    // Parse the rendered page.
    Document document = Jsoup.parse(pageAsXml);

    // The element with id "img" may be missing (for example on an error page, since failing
    // status codes do not throw), so fall back to an empty result instead of a NullPointerException.
    Element element = document.getElementById("img");
    Elements pictures = (element == null) ? new Elements() : element.select("img[src]");


Comment