开始准备
Chrome

chromedriver 驱动( 下载不同版本浏览器对应驱动教程 )
代码
pom 依赖:
<dependencies>
    <!-- 添加浏览器驱动 -->
    <dependency>
        <groupId>org.seleniumhq.selenium</groupId>
        <artifactId>selenium-server</artifactId>
        <version>3.141.59</version>
    </dependency>
    <dependency>
        <groupId>cn.hutool</groupId>
        <artifactId>hutool-all</artifactId>
        <version>4.5.6</version>
    </dependency>
</dependencies>
Main 代码:
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.lang.Console;
import cn.hutool.core.util.StrUtil;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
public class LagouSpider { private static ArrayList<String> strings = new ArrayList<>();
/**
 * Starts a local chromedriver service, opens https://tophub.today/ in a remote-controlled
 * Chrome session, and calls {@link #run} for every (node, entry) pair in a fixed range.
 * The collected board titles are logged at the end.
 *
 * <p>The driver session and the chromedriver service are always shut down, even when
 * navigation or scraping fails.
 *
 * @param args ignored
 * @throws IllegalStateException if the chromedriver executable is not on the classpath
 *                               next to this class, or if the service cannot be started
 */
public static void main(String[] args) {
    // Resolve the bundled driver binary; fail with a clear message instead of a bare NPE.
    java.net.URL driverResource = LagouSpider.class.getResource("chromedriver.exe");
    if (driverResource == null) {
        throw new IllegalStateException(
                "chromedriver.exe was not found on the classpath next to LagouSpider");
    }
    // NOTE(review): getPath() keeps URL escaping (e.g. %20 for spaces) — confirm the
    // install path contains no characters that need escaping.
    String webDriverPath = driverResource.getPath();

    // The driver binary must match the installed Chrome version.
    System.setProperty("webdriver.chrome.driver", webDriverPath);

    // Build the driver service on any free local port.
    ChromeDriverService service = new ChromeDriverService.Builder()
            .usingDriverExecutable(new File(webDriverPath))
            .usingAnyFreePort()
            .build();
    try {
        service.start();
    } catch (IOException e) {
        // Continuing without a running service would only fail later with a less
        // helpful error, so abort here and keep the cause.
        throw new IllegalStateException(
                "Failed to start chromedriver service: " + webDriverPath, e);
    }

    WebDriver driver = null;
    try {
        // Obtain the web driver bound to the service that was just started.
        driver = new RemoteWebDriver(service.getUrl(), DesiredCapabilities.chrome());
        String url = "https://tophub.today/";
        // Open the page.
        driver.get(url);

        for (int i = 0; i < 200; i++) {
            // XPath positions are 1-based, so a[0] can never match; start at 1.
            for (int j = 1; j <= 10; j++) {
                try {
                    run(driver, webDriverPath, url, i, j);
                } catch (Exception ignored) {
                    // Best effort: most (node, entry) pairs do not exist on the page,
                    // so a failed lookup just means "skip this one".
                }
            }
        }
        Console.log(strings);
    } finally {
        try {
            // End the browser session.
            if (driver != null) {
                driver.quit();
            }
        } finally {
            // Stop the chromedriver service even if quitting the session failed.
            service.stop();
        }
    }
}
/**
 * Scrapes a single entry of a single board from the already-loaded page.
 *
 * <p>Looks up the title of the board whose element id is {@code node-<i>}. If the title is
 * non-empty and has not been seen before, it is recorded in {@code strings} and logged.
 * Then the {@code j}-th link of that board is looked up and, when its text is non-empty,
 * logged together with the current time and the link's {@code href}.
 *
 * <p>Element lookups use {@code driver.findElement}; a missing element surfaces as the
 * exception thrown by that call and is left for the caller to handle.
 *
 * @param driver        driver whose current page is queried
 * @param webDriverPath not used by this method; kept so existing callers still compile
 * @param url           not used by this method; kept so existing callers still compile
 * @param i             numeric suffix of the board element id ({@code node-<i>})
 * @param j             position of the link inside the board, used as an XPath index
 */
public static void run(WebDriver driver, String webDriverPath, String url, int i, int j) {
    String nodeExpression = "//div[@id='node-" + i + "']/div";
    String titleExpression = nodeExpression
            + "/div[contains(@class, 'cc-cd-ih')]/div[contains(@class, 'cc-cd-is')]/a/div[contains(@class, 'cc-cd-lb')]";
    String linkExpression = nodeExpression
            + "/div[contains(@class, 'cc-cd-cb nano has-scrollbar')]/div[contains(@class, 'cc-cd-cb-l nano-content')]/a[" + j + "]";
    String contentExpression = linkExpression
            + "/div[contains(@class, 'cc-cd-cb-ll')]/span[contains(@class, 't')]";

    // Fetch the board title.
    String title = driver.findElement(By.xpath(titleExpression)).getText();
    if (StrUtil.isEmpty(title)) {
        return;
    }

    // Log each board title only the first time it is encountered.
    if (!strings.contains(title)) {
        strings.add(title);
        Console.log("[{}]", title);
    }

    // Read the entry text once; every getText() is a round trip to the browser.
    String content = driver.findElement(By.xpath(contentExpression)).getText();
    if (StrUtil.isNotEmpty(content)) {
        // The link expression is absolute, so it is resolved from the driver rather than
        // from the text element.
        String href = driver.findElement(By.xpath(linkExpression)).getAttribute("href");
        // Use one constant template with all three values, so that "{}" inside the scraped
        // text is never treated as a placeholder.
        Console.log("\t[{}]\t[{}]({})", DateUtil.now(), content, href);
    }
}
}

其中涉及的 XPath 相关知识，请查看以下链接：
https://blog.csdn.net/u011541946/article/details/73323911
https://blog.csdn.net/u011541946/article/details/67639423
目录
开始准备
代码
目录
开始准备
代码