必备依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version> <!-- 请检查是否有更新版本 -->
</dependency>
代码示例
package org.iptv.demo1.python;
import static org.iptv.demo1.utils.Sample.transientions;

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Demo scraper: downloads one shop listing page, follows each product link found on it, and
 * prints the product details to standard output.
 *
 * <p>Every printed message except the delay banner is passed through {@code transientions}, a
 * helper statically imported from elsewhere in the project (it is not defined in this file).
 */
public class WebScraper {

    /** Listing page of the shop to scrape. */
    private static final String SHOP_URL = "https://www.jumia.com.ng/hyxtop-cod/";

    /** User agent sent with every request. */
    private static final String USER_AGENT =
            "MyBot/1.0 (https://mywebsite.com; contact@mywebsite.com)";

    /** Connect/read timeout for every request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Pause after each product page request, in milliseconds, to avoid hammering the site. */
    private static final long REQUEST_DELAY_MILLIS = 3000;

    /**
     * Entry point: fetches the listing page, then fetches and prints every linked product page.
     *
     * <p>A failure to download a single product page is reported and the crawl continues with
     * the next product; a failure to download the listing page ends the run.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            System.out.println(transientions("正在连接到商品列表页面..."));
            Document doc = fetch(SHOP_URL);
            System.out.println(transientions("连接成功,正在解析商品列表..."));

            // Parent container(s) of the product grid.
            Elements productDivs = doc.select("div.-paxs.row._no-g._4cl-3cm-shs");
            if (productDivs.isEmpty()) {
                System.out.println(transientions("未找到商品列表容器,请检查页面结构!"));
                return;
            }

            // Evaluate the product selector once and reuse it for the count and the loop.
            Elements products = productDivs.select("article.prd._fb.col.c-prd");
            System.out.println(transientions("发现商品数量: " + products.size()));

            for (Element product : products) {
                Element linkElement = product.select("a.core").first();
                if (linkElement == null) {
                    continue; // no link, no request made, so no delay needed
                }
                // absUrl resolves the href against the listing page's base URI, so both
                // relative ("/item.html") and already-absolute hrefs produce a valid URL.
                String productLink = linkElement.absUrl("href");
                if (productLink.isEmpty()) {
                    continue; // href missing or not resolvable to an absolute URL
                }
                System.out.println(transientions("发现商品,详情链接:" + productLink));

                try {
                    printProductDetails(fetch(productLink));
                } catch (IOException e) {
                    // One broken product page must not abort the remaining products.
                    System.out.println(transientions("抓取过程中出现错误:" + e.getMessage()));
                    e.printStackTrace();
                }

                System.out.println("------------等待3秒,避免抓取过快,导致Ip被封------------");
                Thread.sleep(REQUEST_DELAY_MILLIS);
            }
            System.out.println(transientions("商品信息解析完成!"));
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the JVM/caller can still observe the interruption.
            Thread.currentThread().interrupt();
            System.out.println(transientions("抓取过程中出现错误:" + e.getMessage()));
        } catch (Exception e) {
            // Top-level boundary of a command-line demo: report and exit.
            System.out.println(transientions("抓取过程中出现错误:" + e.getMessage()));
            e.printStackTrace();
        }
    }

    /**
     * Downloads and parses a page using the shared user agent and timeout.
     *
     * @param url absolute URL of the page
     * @return the parsed document
     * @throws IOException if the request fails
     */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(TIMEOUT_MILLIS)
                .get();
    }

    /**
     * Prints title, prices, stock, features, specifications and description of one product page.
     *
     * @param productDoc parsed product detail page
     */
    private static void printProductDetails(Document productDoc) {
        System.out.println(transientions("正在解析商品详情页面..."));

        String title = productDoc.select("h1.-fs20.-pts.-pbxs").text();
        System.out.println(transientions("商品标题:" + title));

        // Price before discount; absent when the product is not discounted.
        String originalPrice =
                orPlaceholder(productDoc.select("span.-tal.-gy5.-lthr.-fs16.-pvxs.-ubpt").text());
        System.out.println(transientions("优惠前价格:" + originalPrice));

        String discountPrice = productDoc.select("span.-b.-ubpt.-tal.-fs24.-prxs").text();
        System.out.println(transientions("优惠后价格:" + discountPrice));

        // Stock information: try the first selector, then the alternative one.
        String stockInfo = productDoc.select("span.-fsh0.-prs.-fs12").text();
        if (stockInfo.isEmpty()) {
            stockInfo = productDoc.select("span.-df.-i-ctr.-fs12.-pbs.-yl7").text();
        }
        System.out.println(transientions("商品数量:" + orPlaceholder(stockInfo)));

        printBulletList("商品特点:", productDoc.select("div.markup.-pam ul li"));
        printBulletList("商品规格:", productDoc.select("ul.-pvs.-mvxs.-phm.-lsn li"));

        printDescription(productDoc.select("div.markup.-mhm.-pvl.-oxa.-sc"));
    }

    /**
     * Prints the description paragraphs and image URLs found in the description container.
     *
     * @param descriptionDiv elements matched by the description-container selector; may be empty
     */
    private static void printDescription(Elements descriptionDiv) {
        if (descriptionDiv.isEmpty()) {
            System.out.println(transientions("未找到商品描述!"));
            return;
        }
        System.out.println(transientions("商品描述和图片:"));

        // Prefer the text of <p> elements.
        boolean descriptionFound = false;
        for (Element paragraph : descriptionDiv.select("p")) {
            String description = paragraph.text();
            if (!description.isEmpty()) {
                descriptionFound = true;
                System.out.println(transientions(" 描述:" + description));
            }
        }

        // No non-empty <p>: fall back to the container's own text.
        if (!descriptionFound) {
            String fallbackDescription = descriptionDiv.text();
            if (!fallbackDescription.isEmpty()) {
                System.out.println(transientions(" 直接提取描述:" + fallbackDescription));
            } else {
                System.out.println(transientions(" 描述未找到!"));
            }
        }

        // Single pass over every <img> in the container. This already includes images nested
        // inside <p>, so they are not listed separately (which would print them twice).
        for (Element img : descriptionDiv.select("img")) {
            String imageUrl = img.attr("data-src"); // lazy-loaded images keep the URL here
            if (imageUrl.isEmpty()) {
                imageUrl = img.attr("src");
            }
            if (!imageUrl.isEmpty()) {
                System.out.println(transientions(" 图片地址:" + imageUrl));
            } else {
                System.out.println(transientions(" 图片地址未找到!"));
            }
        }
    }

    /**
     * Prints a header line followed by one " - " prefixed line per element.
     *
     * @param header header text (translated before printing)
     * @param items elements whose text is listed
     */
    private static void printBulletList(String header, Elements items) {
        System.out.println(transientions(header));
        for (Element item : items) {
            System.out.println(transientions(" - " + item.text()));
        }
    }

    /**
     * Returns the given text, or the placeholder {@code "无"} when it is empty.
     *
     * @param text extracted text, never {@code null}
     * @return {@code text} if non-empty, otherwise the placeholder
     */
    private static String orPlaceholder(String text) {
        return text.isEmpty() ? "无" : text;
    }
}
需要注意:该网站是国外站点,示例中的输出内容都先经过翻译方法 transientions 处理后再打印;该翻译方法本文没有提供,请自行实现。
温馨提示:
本文最后更新于 2024-11-30 20:57:59,某些文章具有时效性,若有错误或已失效,请在下方留言。
© 版权声明
文章版权归作者所有,未经允许请勿转载。
THE END










暂无评论内容