Java爬虫,通过店铺链接爬取店铺商品数据,爬取JUMIA商品数据

Java爬虫,通过店铺链接爬取店铺商品数据,爬取JUMIA商品数据

必备依赖

<dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version> <!-- 请检查是否有更新版本 -->
        </dependency>

代码示例

package org.iptv.demo1.python;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

import static org.iptv.demo1.utils.Sample.transientions;

/**
 * Command-line scraper that loads a JUMIA shop listing page, follows each product's
 * detail link and prints the product's title, prices, stock note, features,
 * specifications, description and image URLs to standard output.
 *
 * <p>Most messages are passed through the statically imported {@code transientions}
 * helper before printing. All CSS selectors are tied to the page markup and will need
 * updating when the site layout changes.
 */
public class WebScraper {

    /** Shop listing page that is crawled. */
    private static final String SHOP_URL = "https://www.jumia.com.ng/hyxtop-cod/";

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT = "MyBot/1.0 (https://mywebsite.com; contact@mywebsite.com)";

    /** Connect/read timeout for each request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Pause between two product detail requests; the printed wait message says 3 seconds. */
    private static final long REQUEST_DELAY_MILLIS = 3000;

    /** Selector for the container(s) holding the product cards on the listing page. */
    private static final String PRODUCT_LIST_SELECTOR = "div.-paxs.row._no-g._4cl-3cm-shs";

    /** Selector for a single product card inside the list container. */
    private static final String PRODUCT_CARD_SELECTOR = "article.prd._fb.col.c-prd";

    /**
     * Entry point: crawls the shop page and prints every product found.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            System.out.println(transientions("正在连接到商品列表页面..."));
            Document doc = fetch(SHOP_URL);
            System.out.println(transientions("连接成功,正在解析商品列表..."));

            Elements productDivs = doc.select(PRODUCT_LIST_SELECTOR);
            if (productDivs.isEmpty()) {
                System.out.println(transientions("未找到商品列表容器,请检查页面结构!"));
                return;
            }

            // Evaluate the card selector once and reuse it for both the count and the loop.
            Elements products = productDivs.select(PRODUCT_CARD_SELECTOR);
            System.out.println(transientions("发现商品数量: " + products.size()));

            boolean detailRequested = false;
            for (Element product : products) {
                Element linkElement = product.select("a.core").first();
                if (linkElement == null) {
                    continue;
                }
                // absUrl resolves relative hrefs against the listing page and leaves absolute
                // ones intact; it yields "" when there is no usable href, so skip those cards.
                String productLink = linkElement.absUrl("href");
                if (productLink.isEmpty()) {
                    continue;
                }

                // Throttle only between real requests: no pause before the first detail
                // page, after the last one, or for cards that were skipped.
                if (detailRequested) {
                    System.out.println("------------等待3秒,避免抓取过快,导致Ip被封------------");
                    Thread.sleep(REQUEST_DELAY_MILLIS);
                }
                detailRequested = true;

                System.out.println(transientions("发现商品,详情链接:" + productLink));
                try {
                    printProductDetails(fetch(productLink));
                } catch (IOException e) {
                    // A single unreachable product must not abort the rest of the crawl.
                    System.out.println(transientions("抓取过程中出现错误:" + e.getMessage()));
                }
            }

            System.out.println(transientions("商品信息解析完成!"));
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the runtime can still observe the interruption.
            Thread.currentThread().interrupt();
            System.out.println(transientions("抓取过程中出现错误:" + e.getMessage()));
        } catch (Exception e) {
            System.out.println(transientions("抓取过程中出现错误:" + e.getMessage()));
            e.printStackTrace();
        }
    }

    /**
     * Downloads and parses a page using the shared user agent and timeout.
     *
     * @param url absolute URL of the page
     * @return the parsed document
     * @throws IOException if the request fails
     */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(TIMEOUT_MILLIS)
                .get();
    }

    /**
     * Prints every extracted field of one product detail page.
     *
     * @param productDoc parsed product detail page
     */
    private static void printProductDetails(Document productDoc) {
        System.out.println(transientions("正在解析商品详情页面..."));

        // Title
        String title = productDoc.select("h1.-fs20.-pts.-pbxs").text();
        System.out.println(transientions("商品标题:" + title));

        // Price before discount; the element is absent when there is no discount.
        String originalPrice = orDefault(
                productDoc.select("span.-tal.-gy5.-lthr.-fs16.-pvxs.-ubpt").text(), "无");
        System.out.println(transientions("优惠前价格:" + originalPrice));

        // Price after discount
        String discountPrice = productDoc.select("span.-b.-ubpt.-tal.-fs24.-prxs").text();
        System.out.println(transientions("优惠后价格:" + discountPrice));

        // Stock note: try the primary selector first, then the alternative one.
        String stockInfo = productDoc.select("span.-fsh0.-prs.-fs12").text();
        if (stockInfo.isEmpty()) {
            stockInfo = productDoc.select("span.-df.-i-ctr.-fs12.-pbs.-yl7").text();
        }
        System.out.println(transientions("商品数量:" + orDefault(stockInfo, "无")));

        // Feature bullet points
        System.out.println(transientions("商品特点:"));
        printBulletList(productDoc.select("div.markup.-pam ul li"));

        // Specification bullet points
        System.out.println(transientions("商品规格:"));
        printBulletList(productDoc.select("ul.-pvs.-mvxs.-phm.-lsn li"));

        printDescription(productDoc);
    }

    /**
     * Prints the description paragraphs and the image URLs of the description section.
     *
     * @param productDoc parsed product detail page
     */
    private static void printDescription(Document productDoc) {
        Elements descriptionDiv = productDoc.select("div.markup.-mhm.-pvl.-oxa.-sc");
        if (descriptionDiv.isEmpty()) {
            System.out.println(transientions("未找到商品描述!"));
            return;
        }

        System.out.println(transientions("商品描述和图片:"));

        // Prefer the text of the <p> children.
        boolean descriptionFound = false;
        for (Element paragraph : descriptionDiv.select("p")) {
            String description = paragraph.text();
            if (!description.isEmpty()) {
                descriptionFound = true;
                System.out.println(transientions("  描述:" + description));
            }
        }

        // No non-empty paragraph: fall back to the text of the whole section.
        if (!descriptionFound) {
            String fallbackDescription = descriptionDiv.text();
            if (!fallbackDescription.isEmpty()) {
                System.out.println(transientions("  直接提取描述:" + fallbackDescription));
            } else {
                System.out.println(transientions("  描述未找到!"));
            }
        }

        // One pass over every <img> in the section. This already includes images nested in
        // <p> elements, so they are not listed separately per paragraph (which would print
        // each of them twice).
        for (Element img : descriptionDiv.select("img")) {
            // Prefer "data-src" and fall back to "src" when it is empty.
            String imageUrl = img.attr("data-src");
            if (imageUrl.isEmpty()) {
                imageUrl = img.attr("src");
            }
            if (!imageUrl.isEmpty()) {
                System.out.println(transientions("  图片地址:" + imageUrl));
            } else {
                System.out.println(transientions("  图片地址未找到!"));
            }
        }
    }

    /**
     * Prints the text of each element as an indented bullet line.
     *
     * @param items elements whose text is printed, one per line
     */
    private static void printBulletList(Elements items) {
        for (Element item : items) {
            System.out.println(transientions("  - " + item.text()));
        }
    }

    /**
     * Returns {@code value}, or {@code fallback} when {@code value} is empty.
     *
     * @param value    extracted text, never {@code null}
     * @param fallback replacement used for empty text
     * @return the text to display
     */
    private static String orDefault(String value, String fallback) {
        return value.isEmpty() ? fallback : value;
    }
}

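需要注意:这个网站是国外站点。示例代码在打印时调用了静态导入的翻译方法 `transientions`(来自 `org.iptv.demo1.utils.Sample`),该方法本文没有给出,需要自行实现;如果不需要翻译,把 `transientions(...)` 调用去掉、直接打印字符串即可。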

温馨提示: 本文最后更新于2024-11-30 20:57:59,某些文章具有时效性,若有错误或已失效,请在下方 留言
© 版权声明
THE END
喜欢就支持一下吧
点赞5赞赏 分享
评论 抢沙发

请登录后发表评论

    暂无评论内容