本节目的:详解TVBox的spider爬虫的5个接口规范,以一个案例演示如何制作一个网站的爬虫源并测试。 示范工程以FongMi版爬虫仓库为例,https://github.com/FongMi/CatVodSpider
https://apifox.com/apidoc/shared-284a6632-f5d4-45f8-b796-d85605de5af4
https://www.bejson.com/jsonviewernew/?tdsourcetag=s_pcqq_aiomsg
策驰影视 https://www.algdts.com
备用线路:https://cechi16.com
https://www.algdts.com/index.php/home/vod/type-id-25-mcid-11-area-%E4%B8%AD%E5%9B%BD-year-2026-letter-A-order--picm-1-p-1 其中各参数含义:type-id: 分类 mcid: 类型 area: 地区 year: 年代 letter: 字母 order: 排序 picm: 1 p: 页码
用Android Studio打开spider工程,在爬虫文件的空白处右键 → Go To → Test,选择Create New Test,然后选择测试文件存放的文件夹以及需要测试的方法。 缺少的依赖可以通过Alt+回车让工具自动导包,或者在app/build.gradle里直接添加junit依赖
注意:打包jar之前,需要将junit相关的测试代码和导包一并删除,避免打包报错
package com.github.catvod.spider;
import android.content.Context;
import android.text.TextUtils;
import com.alibaba.fastjson.JSON;
import com.github.catvod.bean.Class;
import com.github.catvod.bean.Filter;
import com.github.catvod.bean.Result;
import com.github.catvod.bean.Vod;
import com.github.catvod.crawler.Spider;
import com.github.catvod.net.OkHttp;
import com.github.catvod.utils.Util;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CeChi extends Spider {
// Base URL of the site being scraped. init() overwrites it with `extend` when that string is non-empty.
private static String siteUrl = "https://www.algdts.com";
/**
 * Builds the HTTP headers sent with every request made by this spider.
 *
 * @return a new mutable map holding a single {@code User-Agent} entry set to {@code Util.CHROME}
 */
private Map<String, String> getHeader() {
    final Map<String, String> requestHeaders = new HashMap<>();
    requestHeaders.put("User-Agent", Util.CHROME);
    return requestHeaders;
}
/**
 * Initializes the spider and optionally replaces the site base URL.
 *
 * @param context Android context, forwarded unchanged to the superclass {@code init}
 * @param extend  optional base URL; ignored when {@code null} or empty
 * @throws Exception if the superclass initialization throws
 */
@Override
public void init(Context context, String extend) throws Exception {
    super.init(context, extend);
    // Check for null before isEmpty(): a missing extend value must not crash initialization.
    if (extend != null && !extend.isEmpty()) {
        siteUrl = extend;
    }
}
/**
 * Builds the home-page payload: a fixed list of categories, the videos found in the
 * first swiper of the site's front page, and one scraped filter list per category.
 *
 * <p>Besides the front page, one page per category
 * ({@code /index.php/home/vod/type-id-<id>}) is fetched to read its filter options.
 *
 * @param filter not consulted by this implementation; filters are always collected
 * @return the string produced by {@code Result.string(classes, list, filters)}
 * @throws Exception propagated from the HTTP or parsing calls
 */
@Override
public String homeContent(boolean filter) throws Exception {
    List<String> typeIds = Arrays.asList("1", "2", "3", "4", "5");
    List<String> typeNames = Arrays.asList("电影", "电视剧", "动漫", "综艺", "微电影");
    List<Class> classes = new ArrayList<>();
    for (int i = 0; i < typeIds.size(); i++) {
        classes.add(new Class(typeIds.get(i), typeNames.get(i)));
    }

    // Recommended videos: only the first swiper on the front page is read.
    Document doc = Jsoup.parse(OkHttp.string(siteUrl, getHeader()));
    List<Vod> list = new ArrayList<>();
    for (Element li : doc.select("div.swiper-wrapper").eq(0).select("li")) {
        Elements link = li.select(".pic-img");
        String vid = siteUrl + link.attr("href");
        String name = link.attr("title");
        String pic = li.select(".pic-img img").attr("data-original");
        String remark = li.select(".sname").text();
        list.add(new Vod(vid, name, pic, remark));
    }

    LinkedHashMap<String, List<Filter>> filters = new LinkedHashMap<>();
    for (String typeId : typeIds) {
        String filterUrl = siteUrl + String.format("/index.php/home/vod/type-id-%s", typeId);
        Document filterDoc = Jsoup.parse(OkHttp.string(filterUrl, getHeader()));
        List<Filter> filterTemp = new ArrayList<>();
        for (Element ul : filterDoc.select("div.top-type div.container").eq(0).select("ul")) {
            String key = ul.id();
            // Compare by content. `key == ""` tests reference identity and only works
            // when the empty id happens to be the interned literal.
            if (key.isEmpty()) {
                key = "id";
            }
            // The first <li> of each list carries the filter's display name.
            String name = ul.select("li").eq(0).text();
            List<Filter.Value> filterValues = new ArrayList<>();
            for (Element li : ul.select("li")) {
                String n = li.select("a").text();
                String v = li.select("a").attr("data");
                // Skip the entry whose text equals the filter name (the label itself).
                if (n.equals(name)) {
                    continue;
                }
                filterValues.add(new Filter.Value(n, v));
            }
            filterTemp.add(new Filter(key, name, filterValues));
        }
        filters.put(typeId, filterTemp);
    }
    return Result.string(classes, list, filters);
}
/**
 * Fetches one page of a category listing and returns the videos found on it.
 *
 * <p>The request path is assembled as
 * {@code /index.php/home/vod/<type><mcid><area><year><letter>-order--picm-1-p-<pg>}, where
 * each optional filter contributes {@code "-" + value} only when present in {@code extend}.
 *
 * @param tid    category id, used as {@code type-id-<tid>} unless {@code extend} has an {@code id} entry
 * @param pg     page number, appended as {@code -p-<pg>}
 * @param filter not consulted by this implementation
 * @param extend selected filter values keyed by {@code id}, {@code mcid}, {@code area},
 *               {@code year}, {@code letter}; may be {@code null}
 * @return the string produced by {@code Result.string(list)}
 * @throws Exception propagated from the HTTP or parsing calls
 */
@Override
public String categoryContent(String tid, String pg, boolean filter, HashMap<String, String> extend) throws Exception {
    Map<String, String> selected = new HashMap<>();
    if (extend != null && !extend.isEmpty()) {
        selected.putAll(extend);
    }

    StringBuilder path = new StringBuilder("/index.php/home/vod/");
    String idFilter = selected.get("id");
    if (idFilter == null) {
        path.append("type-id-" + tid);
    } else {
        path.append("type-" + idFilter);
    }
    // Optional filters, in the order the site expects them in the path.
    for (String filterKey : new String[]{"mcid", "area", "year", "letter"}) {
        String value = selected.get(filterKey);
        if (value != null) {
            path.append("-" + value);
        }
    }
    path.append("-order-").append("-picm-1").append("-p-" + pg);

    String cateUrl = siteUrl + path;
    Document page = Jsoup.parse(OkHttp.string(cateUrl, getHeader()));
    List<Vod> list = new ArrayList<>();
    for (Element item : page.select("div.layout-box ul.pic-list").eq(0).select("li")) {
        Elements anchor = item.select("a");
        list.add(new Vod(
                siteUrl + anchor.attr("href"),
                anchor.attr("title"),
                item.select("img").attr("data-original"),
                item.select(".sname").text()));
    }
    return Result.string(list);
}
/**
 * Loads a video's detail page and extracts its play sources, episode lists and metadata.
 *
 * <p>Source names are joined with {@code $$$}. Within a source, episodes are written as
 * {@code name$url} joined with {@code #}; the sources' episode lists are joined with
 * {@code $$$}.
 *
 * @param ids list whose first element is the absolute URL of the detail page
 * @return the string produced by {@code Result.string(vod)}
 * @throws Exception propagated from the HTTP or parsing calls
 */
@Override
public String detailContent(List<String> ids) throws Exception {
    String detailUrl = ids.get(0);
    Document doc = Jsoup.parse(OkHttp.string(detailUrl, getHeader()));

    Elements titles = doc.select("div.layout-box").select("div.play-title").select("ul").eq(0).select("li");
    Elements lists = doc.select("div.layout-box").select("div.play-list").select("ul");

    StringBuilder vod_play_from = new StringBuilder();
    for (int i = 0; i < titles.size(); i++) {
        if (i > 0) {
            vod_play_from.append("$$$");
        }
        vod_play_from.append(titles.get(i).select("a").text());
    }

    StringBuilder vod_play_url = new StringBuilder();
    for (int i = 0; i < lists.size(); i++) {
        if (i > 0) {
            vod_play_url.append("$$$");
        }
        Elements liList = lists.get(i).select("li");
        for (int j = 0; j < liList.size(); j++) {
            if (j > 0) {
                vod_play_url.append("#");
            }
            Elements link = liList.get(j).select("a");
            vod_play_url.append(link.text()).append("$").append(siteUrl + link.attr("href"));
        }
    }

    String title = doc.select("h1.text-overflow").text();
    // The original selector was select("video-box").select("video-info"). Without a leading
    // dot Jsoup treats those as tag names, so nothing matched and every metadata field
    // stayed empty.
    // NOTE(review): class selectors are assumed here - confirm against the page markup.
    Elements videoInfo = doc.select(".video-box").select(".video-info").eq(1);
    String remark = "";
    String typeName = "";
    String actor = "";
    String director = "";
    String year = "";
    for (Element div : videoInfo.select("div")) {
        // The <span> holds the field label; the stored value is the full text of the <div>.
        String span = div.select("span").text();
        String text = div.text();
        if (span.contains("状态")) {
            remark = text;
        }
        if (span.contains("类型")) {
            typeName = text;
        }
        if (span.contains("主演")) {
            actor = text;
        }
        if (span.contains("导演")) {
            director = text;
        }
        if (span.contains("年代")) {
            year = text;
        }
    }
    String brief = doc.select("div.article-content").select("p").text();

    Vod vod = new Vod();
    vod.setVodId(detailUrl);
    vod.setVodYear(year);
    vod.setVodName(title);
    vod.setVodActor(actor);
    vod.setVodRemarks(remark);
    vod.setVodContent(brief);
    vod.setVodDirector(director);
    vod.setTypeName(typeName);
    vod.setVodPlayFrom(vod_play_from.toString());
    vod.setVodPlayUrl(vod_play_url.toString());
    return Result.string(vod);
}
/**
 * Searches the site for {@code key} and returns the matching videos.
 *
 * @param key   search keyword; URL-encoded as UTF-8 before being placed in the path
 * @param quick not consulted by this implementation
 * @return the string produced by {@code Result.string(list)}
 * @throws Exception propagated from the encoding, HTTP or parsing calls
 */
@Override
public String searchContent(String key, boolean quick) throws Exception {
    // Name the charset explicitly. The one-argument URLEncoder.encode is deprecated and
    // uses the platform default charset, which can mis-encode Chinese keywords when the
    // code runs on a desktop JVM (for example in unit tests).
    String searchUrl = siteUrl + "/search/" + URLEncoder.encode(key, "UTF-8") + ".html";
    Document doc = Jsoup.parse(OkHttp.string(searchUrl, getHeader()));
    List<Vod> list = new ArrayList<>();
    for (Element li : doc.select("#content").select("li")) {
        Elements divs = li.select("div");
        // First <div>: poster link and image. Second <div>: title link and text lines.
        Elements poster = divs.eq(0);
        Elements info = divs.eq(1);
        String vid = siteUrl + poster.select("a").attr("href");
        String name = info.select("a").attr("title");
        String pic = poster.select("a img").attr("data-original");
        String remark = info.select("p").eq(2).text();
        list.add(new Vod(vid, name, pic, remark));
    }
    return Result.string(list);
}
/**
 * Resolves the real media URL for an episode page.
 *
 * <p>The page is expected to contain {@code zanpiancms_player = {...};</script>}; the
 * {@code url} field of that JSON object is returned as the play URL. When the marker is
 * missing, or the object has no {@code url}, an empty URL is returned instead of failing
 * with a {@code NullPointerException}.
 *
 * @param flag     not consulted by this implementation
 * @param id       absolute URL of the episode page
 * @param vipFlags not consulted by this implementation
 * @return the string built by {@code Result.get().url(...).header(...).string()}
 * @throws Exception propagated from the HTTP or JSON parsing calls
 */
@Override
public String playerContent(String flag, String id, List<String> vipFlags) throws Exception {
    String content = OkHttp.string(id, getHeader());
    Matcher matcher = Pattern.compile("zanpiancms_player = (.*?);</script>").matcher(content);
    String realUrl = "";
    if (matcher.find()) {
        // Fully qualified because org.json.JSONObject is also imported in this file.
        com.alibaba.fastjson.JSONObject player = JSON.parseObject(matcher.group(1));
        String url = player == null ? null : player.getString("url");
        if (url != null) {
            realUrl = url;
        }
    }
    return Result.get().url(realUrl).header(getHeader()).string();
}
}
7.更新代码
由于策驰网站已更新,需要同步更新代码:
package com.github.catvod.spider;
import android.content.Context;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.github.catvod.bean.Class;
import com.github.catvod.bean.Filter;
import com.github.catvod.bean.Result;
import com.github.catvod.bean.Vod;
import com.github.catvod.crawler.Spider;
import com.github.catvod.net.OkHttp;
import com.github.catvod.utils.Notify;
import com.github.catvod.utils.Util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CeChi extends Spider {
// Base URL of the site being scraped. init() overwrites it with `extend` when that string is non-empty.
private static String siteUrl = "https://cechi16.com";
// Endpoint queried by categoryContent(); the query parameters are appended after the trailing '?'.
private static String cateUrl = "https://gda52.quelingfei.com/getsortdata_all_z.php?";
// Endpoint queried by searchContent(); the URL-encoded keyword is appended after "q=".
private static String searchUrl = "https://v2.quelingfei.com/ssszz.php?q=";
/**
 * Builds the HTTP headers sent with every request made by this spider.
 *
 * @return a new mutable map holding a single {@code User-Agent} entry set to {@code Util.CHROME}
 */
private Map<String, String> getHeader() {
    final Map<String, String> requestHeaders = new HashMap<>();
    requestHeaders.put("User-Agent", Util.CHROME);
    return requestHeaders;
}
/**
 * Initializes the spider and optionally replaces the site base URL.
 *
 * @param context Android context, forwarded unchanged to the superclass {@code init}
 * @param extend  optional base URL; ignored when {@code null} or empty
 * @throws Exception if the superclass initialization throws
 */
@Override
public void init(Context context, String extend) throws Exception {
    super.init(context, extend);
    // Check for null before isEmpty(): a missing extend value must not crash initialization.
    if (extend != null && !extend.isEmpty()) {
        siteUrl = extend;
    }
}
/**
 * Builds the home-page payload: a fixed list of categories, the videos listed on the
 * site's front page, and one scraped filter list per category.
 *
 * <p>Besides the front page, one listing page per category
 * ({@code /<type>/0/0/all/1.html}) is fetched to read its filter options. Each option's
 * value is a path segment of the option link: segment 3 for filters whose name contains
 * "剧情", segment 2 for "年代", segment 4 for "地区".
 *
 * @param filter not consulted by this implementation; filters are always collected
 * @return the string produced by {@code Result.string(classes, list, filters)}
 * @throws Exception propagated from the HTTP or parsing calls
 */
@Override
public String homeContent(boolean filter) throws Exception {
    List<String> typeIds = Arrays.asList("tv", "mov", "acg", "zongyi");
    List<String> typeNames = Arrays.asList("电视剧", "电影", "动漫", "综艺");
    List<Class> classes = new ArrayList<>();
    for (int i = 0; i < typeIds.size(); i++) {
        classes.add(new Class(typeIds.get(i), typeNames.get(i)));
    }

    Document doc = Jsoup.parse(OkHttp.string(siteUrl, getHeader()));
    List<Vod> list = new ArrayList<>();
    for (Element li : doc.select("div.index-list-l").select("ul").select("li")) {
        Elements link = li.select("a.li-hv");
        String vid = siteUrl + "/" + link.attr("href");
        String name = link.attr("title");
        String pic = li.select("a.li-hv img").attr("data-original");
        String remark = li.select(".name").text();
        list.add(new Vod(vid, name, pic, remark));
    }

    LinkedHashMap<String, List<Filter>> filters = new LinkedHashMap<>();
    for (String typeId : typeIds) {
        String filterUrl = siteUrl + String.format("/%s/0/0/all/1.html", typeId);
        Document filterDoc = Jsoup.parse(OkHttp.string(filterUrl, getHeader()));
        List<Filter> filterTemp = new ArrayList<>();
        for (Element dl : filterDoc.select("div.sy").eq(0).select("dl")) {
            // The <dt> label doubles as the filter key; categoryContent() looks filters up by it.
            // NOTE(review): only the ASCII colon is stripped - confirm the page does not use
            // a full-width colon, otherwise the keys will not match.
            String name = dl.select("dt").eq(0).text().trim().replace(":", "");
            String key = name;

            // Index of the href path segment holding this filter's value, or -1 for an
            // unrecognised filter. The checks stay independent so a later match wins,
            // as in the original.
            int segment = -1;
            if (name.contains("剧情")) {
                segment = 3;
            }
            if (name.contains("年代")) {
                segment = 2;
            }
            if (name.contains("地区")) {
                segment = 4;
            }

            List<Filter.Value> filterValues = new ArrayList<>();
            for (Element a : dl.select("dd").select("a")) {
                String v = "";
                if (segment >= 0) {
                    String[] parts = a.attr("href").split("/");
                    // Skip links too short to carry the expected segment instead of
                    // failing with ArrayIndexOutOfBoundsException.
                    if (parts.length <= segment) {
                        continue;
                    }
                    v = parts[segment];
                }
                filterValues.add(new Filter.Value(a.text(), v));
            }
            filterTemp.add(new Filter(key, name, filterValues));
        }
        filters.put(typeId, filterTemp);
    }
    return Result.string(classes, list, filters);
}
/**
 * Fetches one page of a category listing from the {@code cateUrl} endpoint.
 *
 * <p>Filters missing from {@code extend} fall back to {@code "0"} for "按剧情" and "按年代"
 * and to {@code "all"} for "按地区".
 *
 * @param tid    category id, sent as the {@code action} query parameter
 * @param pg     page number, sent as the {@code page} query parameter
 * @param filter not consulted by this implementation
 * @param extend selected filter values keyed by "按剧情", "按地区", "按年代"; may be {@code null}
 * @return the string produced by {@code Result.string(list)}
 * @throws Exception propagated from the HTTP or parsing calls
 */
@Override
public String categoryContent(String tid, String pg, boolean filter, HashMap<String, String> extend) throws Exception {
    Map<String, String> selected = new HashMap<>();
    if (extend != null && !extend.isEmpty()) {
        selected.putAll(extend);
    }

    String cla = selected.get("按剧情");
    if (cla == null) {
        cla = "0";
    }
    String area = selected.get("按地区");
    if (area == null) {
        area = "all";
    }
    String year = selected.get("按年代");
    if (year == null) {
        year = "0";
    }

    String cates = cateUrl
            + "action=" + tid
            + "&page=" + pg
            + "&year=" + year
            + "&area=" + area
            + "&class=" + cla
            + "&dect=0&id=";
    Document page = Jsoup.parse(OkHttp.string(cates, getHeader()));

    List<Vod> list = new ArrayList<>();
    for (Element item : page.select("li")) {
        Elements anchor = item.select("a");
        list.add(new Vod(
                siteUrl + anchor.attr("href"),
                anchor.attr("title"),
                item.select("img").attr("data-original"),
                item.select(".name").text()));
    }
    return Result.string(list);
}
/**
 * Loads a video's detail page and extracts its play sources, episode lists and metadata.
 *
 * <p>Source names are joined with {@code $$$}. Episodes are written as {@code name$url}
 * joined with {@code #}. Episode lists beyond the number of source tabs are ignored, and
 * {@code $$$} follows every emitted list except the one at the last tab index.
 *
 * @param ids list whose first element is the absolute URL of the detail page
 * @return the string produced by {@code Result.string(vod)}
 * @throws Exception propagated from the HTTP or parsing calls
 */
@Override
public String detailContent(List<String> ids) throws Exception {
    Document page = Jsoup.parse(OkHttp.string(ids.get(0), getHeader()));
    Elements sourceTabs = page.select("div.pfrom").select("ul").select("li[id^=tab]");
    Elements episodeLists = page.select("div.url").select("ul.urlli").select("ul");
    int tabCount = sourceTabs.size();

    StringBuilder playFrom = new StringBuilder();
    for (int t = 0; t < tabCount; t++) {
        if (t > 0) {
            playFrom.append("$$$");
        }
        playFrom.append(sourceTabs.get(t).text());
    }

    StringBuilder playUrl = new StringBuilder();
    // Only lists that have a matching source tab are emitted.
    int groupCount = Math.min(episodeLists.size(), tabCount);
    for (int g = 0; g < groupCount; g++) {
        Elements episodes = episodeLists.get(g).select("li");
        for (int k = 0; k < episodes.size(); k++) {
            if (k > 0) {
                playUrl.append("#");
            }
            Elements link = episodes.get(k).select("a");
            playUrl.append(link.text()).append("$").append(siteUrl + link.attr("href"));
        }
        if (g != tabCount - 1) {
            playUrl.append("$$$");
        }
    }

    String title = page.select("dt.name").text();
    String remark = "";
    String typeName = "";
    String actor = "";
    String director = "";
    String year = "";
    for (Element row : page.select("div.info").eq(0).select("dl").select("dd")) {
        // The <b> element holds the field label; the stored value is the full text of the <dd>.
        String label = row.select("b").text();
        String rowText = row.text();
        if (label.contains("状态")) {
            remark = rowText;
        }
        if (label.contains("类型")) {
            typeName = rowText;
        }
        if (label.contains("主演")) {
            actor = rowText;
        }
        if (label.contains("导演")) {
            director = rowText;
        }
        if (label.contains("年代")) {
            year = rowText;
        }
    }
    String brief = page.select("div.article-content").select("p").text();

    Vod vod = new Vod();
    vod.setVodId(ids.get(0));
    vod.setVodName(title);
    vod.setTypeName(typeName);
    vod.setVodYear(year);
    vod.setVodActor(actor);
    vod.setVodDirector(director);
    vod.setVodRemarks(remark);
    vod.setVodContent(brief);
    vod.setVodPlayFrom(playFrom.toString());
    vod.setVodPlayUrl(playUrl.toString());
    return Result.string(vod);
}
/**
 * Searches via the {@code searchUrl} endpoint and maps each JSON entry to a video.
 *
 * <p>The response is parsed as a JSON array whose objects supply {@code url},
 * {@code title}, {@code thumb} and {@code area}. An empty result is returned when the
 * response does not parse to an array.
 *
 * @param key   search keyword; URL-encoded as UTF-8 before being appended to the query
 * @param quick not consulted by this implementation
 * @return the string produced by {@code Result.string(list)}
 * @throws Exception propagated from the encoding, HTTP or JSON parsing calls
 */
@Override
public String searchContent(String key, boolean quick) throws Exception {
    // Name the charset explicitly. The one-argument URLEncoder.encode is deprecated and
    // uses the platform default charset, which can mis-encode Chinese keywords when the
    // code runs on a desktop JVM (for example in unit tests).
    String url = searchUrl + URLEncoder.encode(key, "UTF-8");
    String json = OkHttp.string(url, getHeader());
    JSONArray objects = JSONArray.parseArray(json);
    List<Vod> list = new ArrayList<>();
    // Guard against a null array (for example an empty response body) so that
    // "no results" does not end in a NullPointerException.
    if (objects == null) {
        return Result.string(list);
    }
    for (int j = 0; j < objects.size(); j++) {
        JSONObject vod = objects.getJSONObject(j);
        String vid = siteUrl + vod.getString("url");
        String name = vod.getString("title");
        String pic = vod.getString("thumb");
        String remark = vod.getString("area");
        list.add(new Vod(vid, name, pic, remark));
    }
    return Result.string(list);
}
/**
 * Resolves the real media URL for an episode page.
 *
 * <p>Steps: take the numeric episode id from the {@code /<digits>.html} part of
 * {@code id}; fetch the page and read the {@code src} of the script tag that directly
 * precedes {@code <body>}; fetch that script and pick the {@code .m3u8} URL assigned to
 * {@code playarr_*[<episode id>]}. An empty URL is returned when any step finds nothing.
 *
 * @param flag     not consulted by this implementation
 * @param id       absolute URL of the episode page
 * @param vipFlags not consulted by this implementation
 * @return the string built by {@code Result.get().url(...).header(...).string()}
 * @throws Exception propagated from the HTTP calls
 */
@Override
public String playerContent(String flag, String id, List<String> vipFlags) throws Exception {
    // The dot before "html" is escaped so that it only matches a literal '.'.
    Matcher idMatcher = Pattern.compile("/(\\d+)\\.html").matcher(id);
    String pid = idMatcher.find() ? idMatcher.group(1) : "";

    String content = OkHttp.string(id, getHeader());
    Matcher scriptMatcher = Pattern.compile("src=\"(.*?)\"></script><body>").matcher(content);
    String js = scriptMatcher.find() ? scriptMatcher.group(1) : "";

    String realUrl = "";
    // Without an episode id or a script URL there is nothing to look up, so the second
    // request is skipped rather than issued with an empty URL.
    if (!pid.isEmpty() && !js.isEmpty()) {
        String urls = OkHttp.string(js, getHeader());
        // pid is digits only (captured by \d+), so it is safe inside the pattern;
        // the dot before "m3u8" is escaped to match a literal '.'.
        Matcher urlMatcher = Pattern.compile("playarr_(.*?)\\[" + pid + "\\]=\"(.*?\\.m3u8),-1,").matcher(urls);
        if (urlMatcher.find()) {
            realUrl = urlMatcher.group(2);
        }
    }
    return Result.get().url(realUrl).header(getHeader()).string();
}
} 测试文件:
package com.github.catvod.spider;
import static org.junit.jupiter.api.Assertions.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
class CeChiTest {
// Placeholder test: the body is empty, so CeChi.init is neither called nor asserted on.
@org.junit.jupiter.api.Test
void init() {
}
// Manual smoke test: calls homeContent(false) on a fresh spider and prints the result.
@org.junit.jupiter.api.Test
void homeContent() throws Exception {
    CeChi spider = new CeChi();
    System.out.println(spider.homeContent(false));
}
// Manual smoke test: requests page "1" of category "mov" with no filters and prints the result.
@org.junit.jupiter.api.Test
void categoryContent() throws Exception {
    CeChi spider = new CeChi();
    HashMap<String, String> noFilters = new HashMap<>();
    System.out.println(spider.categoryContent("mov", "1", false, noFilters));
}
// Manual smoke test: loads one fixed detail page URL and prints the result.
@org.junit.jupiter.api.Test
void detailContent() throws Exception {
    CeChi spider = new CeChi();
    List<String> ids = Arrays.asList("https://cechi16.com/tv/88868/");
    System.out.println(spider.detailContent(ids));
}
// Manual smoke test: searches for a fixed keyword and prints the result.
@org.junit.jupiter.api.Test
void searchContent() throws Exception {
    CeChi spider = new CeChi();
    System.out.println(spider.searchContent("斗罗大陆", false));
}
// Manual smoke test: resolves the play URL of one fixed episode page and prints the result.
@org.junit.jupiter.api.Test
void playerContent() throws Exception {
    CeChi spider = new CeChi();
    List<String> vipFlags = new ArrayList<>();
    System.out.println(spider.playerContent("", "https://cechi16.com/tv/88868/40.html", vipFlags));
}
} build.gradle中引入了新的jar:
implementation 'com.alibaba:fastjson:1.2.76'