Java使用Chrome驱动作爬虫以及示例

😂 这篇文章最后更新于1802天前,您需要注意相关的内容是否还可用。
目录导航
  • 开始准备
  • 代码
开始准备

    1. Chrome

    2. chromedriver驱动(下载不同版本浏览器对应驱动的教程)

    代码

    pom依赖:

    <dependencies>
        <!-- Selenium: supplies the org.openqa.selenium WebDriver / ChromeDriverService classes used by the crawler -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-server</artifactId>
            <version>3.141.59</version>
        </dependency>
    
        <!-- Hutool: supplies the cn.hutool utilities (Console, StrUtil, DateUtil) used by the crawler -->
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>4.5.6</version>
        </dependency>
    </dependencies>

    Main代码:

    import cn.hutool.core.date.DateUtil;
    import cn.hutool.core.lang.Console;
    import cn.hutool.core.util.StrUtil;
    import org.openqa.selenium.By;
    import org.openqa.selenium.WebDriver;
    import org.openqa.selenium.WebElement;
    import org.openqa.selenium.chrome.ChromeDriverService;
    import org.openqa.selenium.remote.DesiredCapabilities;
    import org.openqa.selenium.remote.RemoteWebDriver;
    import java.io.File;
    import java.io.IOException;
    import java.util.ArrayList;
    
    public class LagouSpider {
        private static ArrayList<String> strings = new ArrayList<>();
    
        public static void main(String[] args) {
    
            String webDriverPath = LagouSpider.class.getResource("chromedriver.exe").getPath();
    
            // 这里需要注意一定要和打开的Chrome版本匹配
            System.setProperty("webdriver.chrome.driver", webDriverPath);
    
            // 构建驱动
            ChromeDriverService service = new ChromeDriverService.Builder().
                    usingDriverExecutable(new File(webDriverPath)).usingAnyFreePort().build();
            try {
                service.start();
            } catch (IOException e) {
                e.printStackTrace();
            }
    
            // 获取Web驱动
            WebDriver driver = new RemoteWebDriver(service.getUrl(), DesiredCapabilities.chrome());
    
            String url = "https://tophub.today/";
    
            // 访问页面
            driver.get(url);
    
    
            for (int i = 0; i < 200; i++) {
                for (int j = 0; j <= 10; j++) {
                    try {
                        run(driver, webDriverPath, url, i, j);
                    } catch (Exception e) {
                        continue;
                    }
                }
            }
            Console.log(strings);
    
            // 退出驱动线程
            driver.quit();
            // 关闭service服务
            service.stop();
        }
    
        public static void run(WebDriver driver, String webDriverPath, String url, int i, int j) {
            String titleExpression =
                    "//div[@id='node-" + i + "']/div/div[contains(@class, 'cc-cd-ih')]/div[contains(@class, 'cc-cd-is')]/a/div[contains(@class, 'cc-cd-lb')]";
            String contentExpression =
                    "//div[@id='node-" + i + "']/div/div[contains(@class, 'cc-cd-cb nano has-scrollbar')]/div[contains(@class, 'cc-cd-cb-l nano-content')]/a[" + j + "]/div[contains(@class, 'cc-cd-cb-ll')]/span[contains(@class, 't')]";
    
            // 获取标题
            WebElement titleElement = driver.findElement(By.xpath(titleExpression));
            String titleElementText = titleElement.getText();
            if (StrUtil.isNotEmpty(titleElementText)) {
                boolean hasStr = -1 == strings.indexOf(titleElementText);
                if (hasStr) {
                    strings.add(titleElementText);
                    Console.log(StrUtil.format("[{}]",
                            titleElementText));
                }
                WebElement textChildEle = driver.findElement(By.xpath(contentExpression));
                if (StrUtil.isNotEmpty(textChildEle.getText())) {
                    Console.log(StrUtil.format("\t[{}]\t[{}]({})",
                            DateUtil.now(),
                            textChildEle.getText()),
                            textChildEle.findElement(By.xpath("//div[@id='node-" + i + "']/div/div[contains(@class, 'cc-cd-cb nano has-scrollbar')]/div[contains(@class, 'cc-cd-cb-l nano-content')]/a[" + j + "]")).getAttribute("href"));
                }
    
                /*try {
                    Thread.sleep(1000L);
                } catch (InterruptedException e) {
                    Console.error("Thread sleep Error");
                }*/
            }
            // 获取内容列表
            /*List<WebElement> textParentEle = driver.findElements(By.xpath(contentExpression));
            for (WebElement textChildEle : textParentEle) {
                WebElement childEleElement = textChildEle.findElement(By.className("t"));
                boolean isEmptyForText = StrUtil.isEmpty(childEleElement.getText());
                if (isEmptyForText) {
                    continue;
                }
                Console.log(StrUtil.format("\t[{}]\t[{}]\r\n",
                        DateUtil.now(),
                        textChildEle.getText()));
            }*/
    
        }
    }

    其中涉及的XPath相关知识,请参考以下文章:

    https://blog.csdn.net/u011541946/article/details/73323911

    https://blog.csdn.net/u011541946/article/details/67639423