Shopee台湾站API逆向分析:手把手教你用Java解析商品分类与列表数据
Shopee台湾站API数据解析实战:Java实现商品分类与列表抓取技术

1. 项目背景与合规说明

在电商数据分析领域,获取平台商品分类和列表信息是市场研究、竞品分析的基础工作。Shopee作为东南亚领先的电商平台,其商品数据具有重要商业价值。但需要特别强调的是:

重要提示:根据Shopee的robots.txt协议,平台明确禁止任何形式的自动化数据抓取行为。本文仅作为技术研究案例,演示如何通过合法API接口获取公开数据,所有代码示例不得用于实际生产环境的数据采集。

2. 技术准备与环境搭建

2.1 开发环境要求

- JDK 1.8
- Maven 3.6
- 开发工具:IntelliJ IDEA或Eclipse
- 依赖库:

```xml
<dependencies>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.15.3</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.83</version>
    </dependency>
</dependencies>
```

2.2 核心API接口分析

Shopee台湾站主要提供两类关键API:

分类树API

```
GET https://xiapi.xiapibuy.com/api/v4/pages/get_category_tree
```

返回包含一级、二级分类的完整树形结构。

商品列表API

```
GET https://xiapi.xiapibuy.com/api/v4/search/search_items
```

参数说明:

- fe_categoryids: 二级分类ID
- limit: 每页数量(固定60)
- newest: 偏移量(页数-1)*60
- page_type: 固定search
- scenario: 固定PAGE_OTHERS

3. 核心代码实现

3.1 分类数据解析

```java
public class ShopeeCategoryParser {
    private static final String CATEGORY_API =
        "https://xiapi.xiapibuy.com/api/v4/pages/get_category_tree";

    public List<Category> parseCategories() throws IOException {
        String json = Jsoup.connect(CATEGORY_API)
            .ignoreContentType(true)
            .method(Method.GET)
            .execute()
            .body();

        JSONArray categories = JSON.parseObject(json)
            .getJSONObject("data")
            .getJSONArray("category_list");

        List<Category> result = new ArrayList<>();
        for(int i = 0; i < categories.size(); i++) {
            JSONObject parent = categories.getJSONObject(i);
            Category parentCat = new Category(
                parent.getString("catid"),
                parent.getString("name"),
                parent.getString("display_name"),
                1
            );

            JSONArray children = parent.getJSONArray("children");
            for(int j = 0; j < children.size(); j++) {
                JSONObject child = children.getJSONObject(j);
                Category childCat = new Category(
                    child.getString("catid"),
                    child.getString("name"),
                    child.getString("display_name"),
                    2
                );
                parentCat.addChild(childCat);
            }
            result.add(parentCat);
        }
        return result;
    }
}

class Category {
    private String id;
    private String name;
    private String displayName;
    private int level;
    private List<Category> children;
    // 构造方法、getter/setter省略
}
```

3.2 商品列表抓取

```java
public class ShopeeProductCrawler {
    private static final String PRODUCT_API_TEMPLATE =
        "https://xiapi.xiapibuy.com/api/v4/search/search_items?by=relevancy" +
        "&fe_categoryids=%s&limit=60&newest=%d" +
        "&order=desc&page_type=search&scenario=PAGE_OTHERS&version=2";

    public List<Product> crawlProducts(String categoryId, int maxPages) {
        List<Product> products = new ArrayList<>();

        for(int page = 0; page < maxPages; page++) {
            int offset = page * 60;
            String apiUrl = String.format(PRODUCT_API_TEMPLATE, categoryId, offset);

            try {
                String json = Jsoup.connect(apiUrl)
                    .timeout(30000)
                    .ignoreContentType(true)
                    .execute()
                    .body();

                JSONArray items = JSON.parseObject(json).getJSONArray("items");
                for(int i = 0; i < items.size(); i++) {
                    JSONObject item = items.getJSONObject(i).getJSONObject("item_basic");
                    Product product = parseProduct(item);
                    products.add(product);
                }

                // 如果返回数量不足60说明已到最后一页
                if(items.size() < 60) break;

            } catch (IOException e) {
                System.err.println("Error fetching page " + page + ": " + e.getMessage());
                // 实现重试逻辑或记录错误
            }
        }
        return products;
    }

    private Product parseProduct(JSONObject item) {
        Product product = new Product();
        product.setItemId(item.getString("itemid"));
        product.setName(item.getString("name"));
        product.setPrice(item.getLong("price") / 100000); // 价格转换
        product.setHistoricalSold(item.getInteger("historical_sold"));
        // 其他字段解析...
        return product;
    }
}
```

3.3 完整执行流程

```java
public class ShopeeDataCollector {
    public static void main(String[] args) {
        ShopeeCategoryParser categoryParser = new ShopeeCategoryParser();
        ShopeeProductCrawler productCrawler = new ShopeeProductCrawler();

        try {
            // 1. 获取分类数据
            List<Category> categories = categoryParser.parseCategories();

            // 2. 遍历分类抓取商品
            for(Category parent : categories) {
                System.out.println("Processing category: " + parent.getDisplayName());

                for(Category child : parent.getChildren()) {
                    System.out.println("-- Subcategory: " + child.getDisplayName());

                    // 3. 抓取每个子分类下的商品(限制5页示例)
                    List<Product> products = productCrawler.crawlProducts(child.getId(), 5);

                    // 4. 处理商品数据(存储/分析)
                    processProducts(products);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private static void processProducts(List<Product> products) {
        // 实现数据存储或分析逻辑
        System.out.println("Collected " + products.size() + " products");
    }
}
```

4. 高级技巧与优化方案

4.1 请求头优化

```java
Connection connection = Jsoup.connect(url)
    .header("Accept", "application/json")
    .header("Accept-Language", "zh-TW")
    .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
    .ignoreContentType(true)
    .timeout(30000);
```

4.2 分页控制策略

| 策略 | 优点 | 缺点 |
| --- | --- | --- |
| 固定页数 | 实现简单 | 可能漏抓或空跑 |
| 动态检测 | 准确获取所有数据 | 实现复杂 |
| 增量抓取 | 效率高 | 需要记录状态 |

推荐实现动态分页检测:

```java
boolean hasMore = true;
int page = 0;
while(hasMore) {
    String json = fetchPage(categoryId, page);
    JSONArray items = parseItems(json);

    if(items.isEmpty() || items.size() < 60) {
        hasMore = false;
    } else {
        processItems(items);
        page++;
    }
}
```

4.3 异常处理机制

```java
public String safeFetch(String url, int maxRetry) {
    int retry = 0;
    while(retry < maxRetry) {
        try {
            return Jsoup.connect(url)
                .timeout(15000)
                .ignoreContentType(true)
                .execute()
                .body();
        } catch (SocketTimeoutException e) {
            retry++;
            System.out.println("Timeout, retry " + retry);
        } catch (IOException e) {
            throw new RuntimeException("Fatal error", e);
        }
    }
    throw new RuntimeException("Max retry exceeded");
}
```

5. 数据结构设计

5.1 分类数据结构

```java
public class Category {
    private String id;
    private String name;
    private String displayName;
    private int level;
    private List<Category> children;

    // 嵌套结构便于处理树形关系
    public void addChild(Category child) {
        if(children == null) {
            children = new ArrayList<>();
        }
        children.add(child);
    }
}
```

5.2 商品数据结构

```java
public class Product {
    private String itemId;
    private String shopId;
    private String name;
    private String currency;
    private long price; // 单位:分
    private int historicalSold;
    private int stock;
    private float rating;
    private String image;
    private List<String> images;
    private String description;
    private String categoryPath;

    // 可添加更多业务字段和方法
    public double getPriceInDollar() {
        return price / 100.0;
    }
}
```

6. 
性能优化方案

6.1 多线程实现

```java
ExecutorService executor = Executors.newFixedThreadPool(5);
for(Category category : categories) {
    executor.submit(() -> {
        List<Product> products = crawler.crawlProducts(category.getId());
        // 处理结果
    });
}
executor.shutdown();
executor.awaitTermination(1, TimeUnit.HOURS);
```

6.2 缓存机制

```java
public class ApiCache {
    private static final Map<String, String> cache = new ConcurrentHashMap<>();

    public static String get(String url) throws IOException {
        if(cache.containsKey(url)) {
            return cache.get(url);
        }
        String data = Jsoup.connect(url).ignoreContentType(true).execute().body();
        cache.put(url, data);
        return data;
    }
}
```

6.3 请求间隔控制

```java
public class RateLimiter {
    private long lastRequestTime = 0;
    private final long interval; // 毫秒

    public RateLimiter(int requestsPerSecond) {
        this.interval = 1000 / requestsPerSecond;
    }

    public void acquire() throws InterruptedException {
        long now = System.currentTimeMillis();
        long elapsed = now - lastRequestTime;
        if(elapsed < interval) {
            Thread.sleep(interval - elapsed);
        }
        lastRequestTime = System.currentTimeMillis();
    }
}

// 使用示例
RateLimiter limiter = new RateLimiter(2); // 2次/秒
limiter.acquire();
String data = Jsoup.connect(url).execute().body();
```

7. 
数据存储方案

7.1 MySQL存储设计

```sql
CREATE TABLE categories (
    id VARCHAR(20) PRIMARY KEY,
    name VARCHAR(100),
    display_name VARCHAR(100),
    level TINYINT,
    parent_id VARCHAR(20),
    FOREIGN KEY (parent_id) REFERENCES categories(id)
);

CREATE TABLE products (
    item_id VARCHAR(20) PRIMARY KEY,
    shop_id VARCHAR(20),
    name VARCHAR(200),
    price DECIMAL(10,2),
    currency VARCHAR(3),
    historical_sold INT,
    stock INT,
    rating FLOAT,
    category_id VARCHAR(20),
    crawl_time DATETIME,
    FOREIGN KEY (category_id) REFERENCES categories(id)
);
```

7.2 使用JdbcTemplate批量插入

```java
@Repository
public class ProductRepository {
    @Autowired
    private JdbcTemplate jdbc;

    public void batchInsert(List<Product> products) {
        String sql = "INSERT INTO products VALUES (?,?,?,?,?,?,?,?,?,NOW()) " +
            "ON DUPLICATE KEY UPDATE price=VALUES(price), stock=VALUES(stock)";

        jdbc.batchUpdate(sql, new BatchPreparedStatementSetter() {
            public void setValues(PreparedStatement ps, int i) throws SQLException {
                Product p = products.get(i);
                ps.setString(1, p.getItemId());
                ps.setString(2, p.getShopId());
                // 设置其他参数...
            }

            public int getBatchSize() {
                return products.size();
            }
        });
    }
}
```

8. 反爬虫应对策略

虽然本文示例仅用于技术研究,但实际商业抓取会遇到各种反爬措施:

IP限制 - 使用代理IP池轮换

```java
public class ProxyManager {
    private List<Proxy> proxies;
    private AtomicInteger index = new AtomicInteger(0);

    public Proxy getProxy() {
        int i = index.getAndIncrement() % proxies.size();
        return proxies.get(i);
    }
}
```

请求指纹检测 - 模拟浏览器行为

```java
Connection connection = Jsoup.connect(url)
    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
    .header("Accept-Language", "zh-TW,zh;q=0.9")
    .header("Accept-Encoding", "gzip, deflate, br");
```

验证码识别 - 需要第三方服务支持

再次强调:任何绕过平台限制的抓取行为都可能违反服务条款,请确保您的应用遵守目标网站的规定。