开始准备
Chrome
chromedriver 驱动( 下载不同版本浏览器对应驱动教程 )
代码
pom 依赖:
<dependencies>
    <!-- Selenium: provides the WebDriver API and browser driver bindings -->
    <dependency>
        <groupId>org.seleniumhq.selenium</groupId>
        <artifactId>selenium-server</artifactId>
        <version>3.141.59</version>
    </dependency>
    <!-- Hutool: utility helpers used by the crawler (Console, StrUtil, DateUtil) -->
    <dependency>
        <groupId>cn.hutool</groupId>
        <artifactId>hutool-all</artifactId>
        <version>4.5.6</version>
    </dependency>
</dependencies>
Main 代码:
import cn.hutool.core.date.DateUtil; import cn.hutool.core.lang.Console; import cn.hutool.core.util.StrUtil; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriverService; import org.openqa.selenium.remote.DesiredCapabilities; import org.openqa.selenium.remote.RemoteWebDriver; import java.io.File; import java.io.IOException; import java.util.ArrayList; public class LagouSpider { private static ArrayList<String> strings = new ArrayList<>(); public static void main(String[] args) { String webDriverPath = LagouSpider.class.getResource("chromedriver.exe").getPath(); // 这里需要注意一定要和打开的 Chrome 版本匹配 System.setProperty("webdriver.chrome.driver", webDriverPath); // 构建驱动 ChromeDriverService service = new ChromeDriverService.Builder(). usingDriverExecutable(new File(webDriverPath)).usingAnyFreePort().build(); try { service.start(); } catch (IOException e) { e.printStackTrace(); } // 获取 Web 驱动 WebDriver driver = new RemoteWebDriver(service.getUrl(), DesiredCapabilities.chrome()); String url = "https://tophub.today/"; // 访问页面 driver.get(url); for (int i = 0; i < 200; i++) { for (int j = 0; j <= 10; j++) { try { run(driver, webDriverPath, url, i, j); } catch (Exception e) { continue; } } } Console.log(strings); // 退出驱动线程 driver.quit(); // 关闭 service 服务 service.stop(); } public static void run(WebDriver driver, String webDriverPath, String url, int i, int j) { String titleExpression = "//div[@id='node-" + i + "']/div/div[contains(@class, 'cc-cd-ih')]/div[contains(@class, 'cc-cd-is')]/a/div[contains(@class, 'cc-cd-lb')]"; String contentExpression = "//div[@id='node-" + i + "']/div/div[contains(@class, 'cc-cd-cb nano has-scrollbar')]/div[contains(@class, 'cc-cd-cb-l nano-content')]/a[" + j + "]/div[contains(@class, 'cc-cd-cb-ll')]/span[contains(@class, 't')]"; // 获取标题 WebElement titleElement = driver.findElement(By.xpath(titleExpression)); String titleElementText = 
titleElement.getText(); if (StrUtil.isNotEmpty(titleElementText)) { boolean hasStr = -1 == strings.indexOf(titleElementText); if (hasStr) { strings.add(titleElementText); Console.log(StrUtil.format("[{}]", titleElementText)); } WebElement textChildEle = driver.findElement(By.xpath(contentExpression)); if (StrUtil.isNotEmpty(textChildEle.getText())) { Console.log(StrUtil.format("\t[{}]\t[{}]({})", DateUtil.now(), textChildEle.getText()), textChildEle.findElement(By.xpath("//div[@id='node-" + i + "']/div/div[contains(@class, 'cc-cd-cb nano has-scrollbar')]/div[contains(@class, 'cc-cd-cb-l nano-content')]/a[" + j + "]")).getAttribute("href")); } /*try { Thread.sleep(1000L); } catch (InterruptedException e) { Console.error("Thread sleep Error"); }*/ } // 获取内容列表 /*List<WebElement> textParentEle = driver.findElements(By.xpath(contentExpression)); for (WebElement textChildEle : textParentEle) { WebElement childEleElement = textChildEle.findElement(By.className("t")); boolean isEmptyForText = StrUtil.isEmpty(childEleElement.getText()); if (isEmptyForText) { continue; } Console.log(StrUtil.format("\t[{}]\t[{}]\r\n", DateUtil.now(), textChildEle.getText())); }*/ } }
其中涉及的 XPath 知识，请参考以下 XPath 相关文章：
https://blog.csdn.net/u011541946/article/details/73323911
https://blog.csdn.net/u011541946/article/details/67639423
目录
开始准备
代码
目录
开始准备
代码