TVBox-java爬虫详解
SoEasy同学
编辑于 2024年03月02日 05:27
收录于文集
共11篇

本节目的:详解TVBox的spider爬虫的5个接口规范,以一个案例演示如何制作一个网站的爬虫源并测试。 示范工程以FongMi版爬虫仓库为例,https://github.com/FongMi/CatVodSpider

1.接口规范:

https://apifox.com/apidoc/shared-284a6632-f5d4-45f8-b796-d85605de5af4

2.在线格式化工具:

https://www.bejson.com/jsonviewernew/?tdsourcetag=s_pcqq_aiomsg

3.爬虫站点

策驰影视 https://www.algdts.com

备用线路:https://cechi16.com

4.分类目录:

https://www.algdts.com/index.php/home/vod/type-id-25-mcid-11-area-%E4%B8%AD%E5%9B%BD-year-2026-letter-A-order--picm-1-p-1

URL 参数说明:type-id: 分类;mcid: 类型;area: 地区;year: 年代;letter: 字母;order: 排序;picm: 1;p: 页码

5.本地测试

用 Android Studio 打开 spider 工程,在爬虫文件空白处右键 → Go To → Test,选择 Create New Test,然后选择测试文件存放的文件夹以及需要测试的方法。通过 Alt+Enter(回车)让工具自动导包,或者在 app/build.gradle 里直接添加 junit 依赖。

注意jar包打包时,将junit的代码删除,导包也删除,避免报错

6.代码案例

代码块
Java
自动换行
复制代码
package com.github.catvod.spider;

import android.content.Context;
import android.text.TextUtils;

import com.alibaba.fastjson.JSON;
import com.github.catvod.bean.Class;
import com.github.catvod.bean.Filter;
import com.github.catvod.bean.Result;
import com.github.catvod.bean.Vod;
import com.github.catvod.crawler.Spider;
import com.github.catvod.net.OkHttp;
import com.github.catvod.utils.Util;

import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * TVBox spider for the CeChi site.
 *
 * <p>Implements the home, category, detail, search and player interfaces by
 * downloading pages with {@code OkHttp} and scraping them with Jsoup.
 */
public class CeChi extends Spider {
    /** Captures the value assigned to {@code zanpiancms_player} in a play page script. */
    private static final Pattern PLAYER_CONFIG = Pattern.compile("zanpiancms_player = (.*?);</script>");

    /** Separator placed between play sources in the play-from and play-url strings. */
    private static final String SOURCE_SEPARATOR = "$$$";

    /** Site root; replaced by the {@code extend} value given to {@code init} when one is supplied. */
    private static String siteUrl = "https://www.algdts.com";

    /** Builds the request headers (a desktop Chrome user agent) sent with every request. */
    private Map<String, String> getHeader() {
        Map<String, String> header = new HashMap<>();
        header.put("User-Agent", Util.CHROME);
        return header;
    }

    /**
     * Initialises the spider and optionally overrides the site root.
     *
     * @param context Android context, forwarded to the superclass
     * @param extend  alternative site root; ignored when null or empty
     */
    @Override
    public void init(Context context, String extend) throws Exception {
        super.init(context, extend);
        // Check for null as well as empty so a missing value keeps the default site.
        if (extend != null && !extend.isEmpty()) {
            siteUrl = extend;
        }
    }

    /**
     * Returns the category list, the home page recommendations and the filters of
     * every category.
     *
     * @param filter not consulted; filters are always fetched
     * @return the serialised result built by {@code Result.string}
     */
    @Override
    public String homeContent(boolean filter) throws Exception {
        List<String> typeIds = Arrays.asList("1", "2", "3", "4", "5");
        List<String> typeNames = Arrays.asList("电影", "电视剧", "动漫", "综艺", "微电影");
        List<Class> classes = new ArrayList<>();
        for (int i = 0; i < typeIds.size(); i++) {
            classes.add(new Class(typeIds.get(i), typeNames.get(i)));
        }

        Document doc = Jsoup.parse(OkHttp.string(siteUrl, getHeader()));
        List<Vod> list = new ArrayList<>();
        for (Element li : doc.select("div.swiper-wrapper").eq(0).select("li")) {
            Elements link = li.select(".pic-img");
            String vid = siteUrl + link.attr("href");
            String name = link.attr("title");
            String pic = li.select(".pic-img img").attr("data-original");
            String remark = li.select(".sname").text();
            list.add(new Vod(vid, name, pic, remark));
        }

        LinkedHashMap<String, List<Filter>> filters = new LinkedHashMap<>();
        for (String typeId : typeIds) {
            filters.put(typeId, fetchFilters(typeId));
        }
        return Result.string(classes, list, filters);
    }

    /** Downloads the listing page of one category and turns each filter row into a {@code Filter}. */
    private List<Filter> fetchFilters(String typeId) throws Exception {
        String filterUrl = siteUrl + String.format("/index.php/home/vod/type-id-%s", typeId);
        Document filterDoc = Jsoup.parse(OkHttp.string(filterUrl, getHeader()));
        List<Filter> result = new ArrayList<>();
        for (Element ul : filterDoc.select("div.top-type div.container").eq(0).select("ul")) {
            String key = ul.id();
            // A row without an id attribute is stored under the key "id".
            if (key.isEmpty()) {
                key = "id";
            }
            // The first entry of the row is its label; it is not a selectable value.
            String name = ul.select("li").eq(0).text();
            List<Filter.Value> values = new ArrayList<>();
            for (Element li : ul.select("li")) {
                String n = li.select("a").text();
                String v = li.select("a").attr("data");
                if (n.equals(name)) {
                    continue;
                }
                values.add(new Filter.Value(n, v));
            }
            result.add(new Filter(key, name, values));
        }
        return result;
    }

    /**
     * Lists the videos of one category page.
     *
     * @param tid    category id, used when no "id" filter is selected
     * @param pg     page number, inserted into the URL as given
     * @param filter not consulted
     * @param extend selected filter values keyed by filter key; may be null
     * @return the serialised result built by {@code Result.string}
     */
    @Override
    public String categoryContent(String tid, String pg, boolean filter, HashMap<String, String> extend) throws Exception {
        Map<String, String> ext = new HashMap<>();
        if (extend != null) {
            ext.putAll(extend);
        }
        String cateId = ext.get("id") == null ? "type-id-" + tid : "type-" + ext.get("id");
        String path = cateId
                + dashed(ext, "mcid")
                + dashed(ext, "area")
                + dashed(ext, "year")
                + dashed(ext, "letter")
                + "-order-"
                + "-picm-1"
                + "-p-" + pg;
        String cateUrl = siteUrl + "/index.php/home/vod/" + path;

        Document doc = Jsoup.parse(OkHttp.string(cateUrl, getHeader()));
        List<Vod> list = new ArrayList<>();
        for (Element li : doc.select("div.layout-box ul.pic-list").eq(0).select("li")) {
            String vid = siteUrl + li.select("a").attr("href");
            String name = li.select("a").attr("title");
            String pic = li.select("img").attr("data-original");
            String remark = li.select(".sname").text();
            list.add(new Vod(vid, name, pic, remark));
        }
        return Result.string(list);
    }

    /** Returns {@code "-" + value} for a selected filter, or an empty string when it is not set. */
    private static String dashed(Map<String, String> ext, String key) {
        String value = ext.get(key);
        return value == null ? "" : "-" + value;
    }

    /**
     * Builds the detail record of one video, including its play sources and episodes.
     *
     * @param ids video ids; only the first is used and it is fetched as a full page URL
     * @return the serialised result built by {@code Result.string}
     */
    @Override
    public String detailContent(List<String> ids) throws Exception {
        Document doc = Jsoup.parse(OkHttp.string(ids.get(0), getHeader()));
        Elements titles = doc.select("div.layout-box").select("div.play-title").select("ul").eq(0).select("li");
        Elements lists = doc.select("div.layout-box").select("div.play-list").select("ul");

        StringBuilder playFrom = new StringBuilder();
        for (int i = 0; i < titles.size(); i++) {
            if (i > 0) {
                playFrom.append(SOURCE_SEPARATOR);
            }
            playFrom.append(titles.get(i).select("a").text());
        }

        StringBuilder playUrl = new StringBuilder();
        for (int i = 0; i < lists.size(); i++) {
            if (i > 0) {
                playUrl.append(SOURCE_SEPARATOR);
            }
            playUrl.append(joinEpisodes(lists.get(i)));
        }

        String title = doc.select("h1.text-overflow").text();
        // NOTE(review): "video-box" / "video-info" are tag selectors here; if they are
        // CSS classes on the page they need a leading dot, otherwise nothing is
        // selected and the fields below stay empty. Confirm against the page markup.
        Elements videoInfo = doc.select("video-box").select("video-info").eq(1);

        String remark = "";
        String typeName = "";
        String actor = "";
        String director = "";
        String year = "";
        for (Element div : videoInfo.select("div")) {
            String label = div.select("span").text();
            String text = div.text();
            if (label.contains("状态")) {
                remark = text;
            }
            if (label.contains("类型")) {
                typeName = text;
            }
            if (label.contains("主演")) {
                actor = text;
            }
            if (label.contains("导演")) {
                director = text;
            }
            if (label.contains("年代")) {
                year = text;
            }
        }
        String brief = doc.select("div.article-content").select("p").text();

        Vod vod = new Vod();
        vod.setVodId(ids.get(0));
        vod.setVodYear(year);
        vod.setVodName(title);
        vod.setVodActor(actor);
        vod.setVodRemarks(remark);
        vod.setVodContent(brief);
        vod.setVodDirector(director);
        vod.setTypeName(typeName);
        vod.setVodPlayFrom(playFrom.toString());
        vod.setVodPlayUrl(playUrl.toString());
        return Result.string(vod);
    }

    /** Formats the episodes of one play list as {@code name$url} entries joined by {@code #}. */
    private String joinEpisodes(Element ul) {
        StringBuilder episodes = new StringBuilder();
        Elements items = ul.select("li");
        for (int j = 0; j < items.size(); j++) {
            Element li = items.get(j);
            if (j > 0) {
                episodes.append("#");
            }
            episodes.append(li.select("a").text()).append("$").append(siteUrl).append(li.select("a").attr("href"));
        }
        return episodes.toString();
    }

    /**
     * Searches the site for a keyword.
     *
     * @param key   search keyword; URL-encoded as UTF-8 before it is put in the path
     * @param quick not consulted
     * @return the serialised result built by {@code Result.string}
     */
    @Override
    public String searchContent(String key, boolean quick) throws Exception {
        // The single-argument encode() is deprecated and depends on the platform charset.
        String searchUrl = siteUrl + "/search/" + URLEncoder.encode(key, "UTF-8") + ".html";
        Document doc = Jsoup.parse(OkHttp.string(searchUrl, getHeader()));
        List<Vod> list = new ArrayList<>();
        for (Element li : doc.select("#content").select("li")) {
            String vid = siteUrl + li.select("div").eq(0).select("a").attr("href");
            String name = li.select("div").eq(1).select("a").attr("title");
            String pic = li.select("div").eq(0).select("a img").attr("data-original");
            String remark = li.select("div").eq(1).select("p").eq(2).text();
            list.add(new Vod(vid, name, pic, remark));
        }
        return Result.string(list);
    }

    /**
     * Resolves the stream URL of one episode from the {@code zanpiancms_player}
     * configuration embedded in its play page.
     *
     * @param flag     not consulted
     * @param id       play page URL
     * @param vipFlags not consulted
     * @return the serialised player result; its URL is empty when the page carries no
     *         usable player configuration
     */
    @Override
    public String playerContent(String flag, String id, List<String> vipFlags) throws Exception {
        String content = OkHttp.string(id, getHeader());
        Matcher matcher = PLAYER_CONFIG.matcher(content);
        String realUrl = "";
        if (matcher.find()) {
            // parseObject returns null for blank input, so check before reading "url".
            com.alibaba.fastjson.JSONObject player = JSON.parseObject(matcher.group(1));
            String url = player == null ? null : player.getString("url");
            if (url != null) {
                realUrl = url;
            }
        }
        return Result.get().url(realUrl).header(getHeader()).string();
    }
}
复制成功

7.更新代码

由于策驰网站已更新,需要同步更新代码:

代码块
clike
自动换行
复制代码
package com.github.catvod.spider;

import android.content.Context;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.github.catvod.bean.Class;
import com.github.catvod.bean.Filter;
import com.github.catvod.bean.Result;
import com.github.catvod.bean.Vod;
import com.github.catvod.crawler.Spider;
import com.github.catvod.net.OkHttp;
import com.github.catvod.utils.Notify;
import com.github.catvod.utils.Util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * TVBox spider for the updated CeChi site.
 *
 * <p>Home and detail data are scraped from the site's HTML; category listings and
 * search results come from two separate endpoints.
 */
public class CeChi extends Spider {
    /** Endpoint returning the list items of one category page. */
    private static final String CATE_URL = "https://gda52.quelingfei.com/getsortdata_all_z.php?";

    /** Endpoint returning search results as a JSON array. */
    private static final String SEARCH_URL = "https://v2.quelingfei.com/ssszz.php?q=";

    /** Captures the numeric episode id at the end of a play page URL. */
    private static final Pattern EPISODE_ID = Pattern.compile("/(\\d+)\\.html");

    /** Captures the address of the script that is referenced right before {@code <body>}. */
    private static final Pattern PLAYER_SCRIPT = Pattern.compile("src=\"(.*?)\"></script><body>");

    /** Separator placed between play sources in the play-from and play-url strings. */
    private static final String SOURCE_SEPARATOR = "$$$";

    /** Site root; replaced by the {@code extend} value given to {@code init} when one is supplied. */
    private static String siteUrl = "https://cechi16.com";

    /** Builds the request headers (a desktop Chrome user agent) sent with every request. */
    private Map<String, String> getHeader() {
        Map<String, String> header = new HashMap<>();
        header.put("User-Agent", Util.CHROME);
        return header;
    }

    /**
     * Initialises the spider and optionally overrides the site root.
     *
     * @param context Android context, forwarded to the superclass
     * @param extend  alternative site root; ignored when null or empty
     */
    @Override
    public void init(Context context, String extend) throws Exception {
        super.init(context, extend);
        // Check for null as well as empty so a missing value keeps the default site.
        if (extend != null && !extend.isEmpty()) {
            siteUrl = extend;
        }
    }

    /**
     * Returns the category list, the home page recommendations and the filters of
     * every category.
     *
     * @param filter not consulted; filters are always fetched
     * @return the serialised result built by {@code Result.string}
     */
    @Override
    public String homeContent(boolean filter) throws Exception {
        List<String> typeIds = Arrays.asList("tv", "mov", "acg", "zongyi");
        List<String> typeNames = Arrays.asList("电视剧", "电影", "动漫", "综艺");
        List<Class> classes = new ArrayList<>();
        for (int i = 0; i < typeIds.size(); i++) {
            classes.add(new Class(typeIds.get(i), typeNames.get(i)));
        }

        Document doc = Jsoup.parse(OkHttp.string(siteUrl, getHeader()));
        List<Vod> list = new ArrayList<>();
        for (Element li : doc.select("div.index-list-l").select("ul").select("li")) {
            Elements link = li.select("a.li-hv");
            String vid = siteUrl + "/" + link.attr("href");
            String name = link.attr("title");
            String pic = li.select("a.li-hv img").attr("data-original");
            String remark = li.select(".name").text();
            list.add(new Vod(vid, name, pic, remark));
        }

        LinkedHashMap<String, List<Filter>> filters = new LinkedHashMap<>();
        for (String typeId : typeIds) {
            filters.put(typeId, fetchFilters(typeId));
        }
        return Result.string(classes, list, filters);
    }

    /**
     * Downloads the listing page of one category and turns each filter row into a
     * {@code Filter}. The row label (colon removed) is used both as key and as name,
     * which is why {@code categoryContent} looks values up by label.
     */
    private List<Filter> fetchFilters(String typeId) throws Exception {
        String filterUrl = siteUrl + String.format("/%s/0/0/all/1.html", typeId);
        Document filterDoc = Jsoup.parse(OkHttp.string(filterUrl, getHeader()));
        List<Filter> result = new ArrayList<>();
        for (Element dl : filterDoc.select("div.sy").eq(0).select("dl")) {
            String name = dl.select("dt").eq(0).text().trim().replace(":", "");
            int segment = filterSegment(name);
            List<Filter.Value> values = new ArrayList<>();
            for (Element a : dl.select("dd").select("a")) {
                values.add(new Filter.Value(a.text(), pathSegment(a.attr("href"), segment)));
            }
            result.add(new Filter(name, name, values));
        }
        return result;
    }

    /**
     * Maps a filter label to the index of the href path segment that carries its
     * value, or -1 when the label is not one of the known filters.
     */
    private static int filterSegment(String name) {
        // Checked in this order so that, as before, a later rule wins over an earlier one.
        if (name.contains("地区")) {
            return 4;
        }
        if (name.contains("年代")) {
            return 2;
        }
        if (name.contains("剧情")) {
            return 3;
        }
        return -1;
    }

    /** Returns the {@code index}-th "/"-separated part of {@code href}, or "" when it does not exist. */
    private static String pathSegment(String href, int index) {
        if (index < 0) {
            return "";
        }
        String[] parts = href.split("/");
        return index < parts.length ? parts[index] : "";
    }

    /**
     * Lists the videos of one category page.
     *
     * @param tid    category id, sent as the {@code action} parameter
     * @param pg     page number
     * @param filter not consulted
     * @param extend selected filter values keyed by filter label; may be null
     * @return the serialised result built by {@code Result.string}
     */
    @Override
    public String categoryContent(String tid, String pg, boolean filter, HashMap<String, String> extend) throws Exception {
        Map<String, String> ext = new HashMap<>();
        if (extend != null) {
            ext.putAll(extend);
        }
        String cla = pick(ext, "按剧情", "0");
        String area = pick(ext, "按地区", "all");
        String year = pick(ext, "按年代", "0");
        String cates = CATE_URL + String.format("action=%s&page=%s&year=%s&area=%s&class=%s&dect=0&id=", tid, pg, year, area, cla);

        Document doc = Jsoup.parse(OkHttp.string(cates, getHeader()));
        List<Vod> list = new ArrayList<>();
        for (Element li : doc.select("li")) {
            String vid = siteUrl + li.select("a").attr("href");
            String name = li.select("a").attr("title");
            String pic = li.select("img").attr("data-original");
            String remark = li.select(".name").text();
            list.add(new Vod(vid, name, pic, remark));
        }
        return Result.string(list);
    }

    /** Returns the selected value for {@code key}, or {@code fallback} when none is selected. */
    private static String pick(Map<String, String> ext, String key, String fallback) {
        String value = ext.get(key);
        return value == null ? fallback : value;
    }

    /**
     * Builds the detail record of one video, including its play sources and episodes.
     *
     * @param ids video ids; only the first is used and it is fetched as a full page URL
     * @return the serialised result built by {@code Result.string}
     */
    @Override
    public String detailContent(List<String> ids) throws Exception {
        Document doc = Jsoup.parse(OkHttp.string(ids.get(0), getHeader()));
        Elements titles = doc.select("div.pfrom").select("ul").select("li[id^=tab]");
        Elements lists = doc.select("div.url").select("ul.urlli").select("ul");

        StringBuilder playFrom = new StringBuilder();
        for (int i = 0; i < titles.size(); i++) {
            if (i > 0) {
                playFrom.append(SOURCE_SEPARATOR);
            }
            playFrom.append(titles.get(i).text());
        }

        // Episode lists beyond the number of source tabs are ignored. Separators go
        // between groups only, so fewer lists than tabs no longer leaves a trailing "$$$".
        StringBuilder playUrl = new StringBuilder();
        int groups = Math.min(titles.size(), lists.size());
        for (int i = 0; i < groups; i++) {
            if (i > 0) {
                playUrl.append(SOURCE_SEPARATOR);
            }
            playUrl.append(joinEpisodes(lists.get(i)));
        }

        String title = doc.select("dt.name").text();
        Elements videoInfo = doc.select("div.info").eq(0).select("dl").select("dd");

        String remark = "";
        String typeName = "";
        String actor = "";
        String director = "";
        String year = "";
        for (Element dd : videoInfo) {
            String label = dd.select("b").text();
            String text = dd.text();
            if (label.contains("状态")) {
                remark = text;
            }
            if (label.contains("类型")) {
                typeName = text;
            }
            if (label.contains("主演")) {
                actor = text;
            }
            if (label.contains("导演")) {
                director = text;
            }
            if (label.contains("年代")) {
                year = text;
            }
        }
        String brief = doc.select("div.article-content").select("p").text();

        Vod vod = new Vod();
        vod.setVodId(ids.get(0));
        vod.setVodYear(year);
        vod.setVodName(title);
        vod.setVodActor(actor);
        vod.setVodRemarks(remark);
        vod.setVodContent(brief);
        vod.setVodDirector(director);
        vod.setTypeName(typeName);
        vod.setVodPlayFrom(playFrom.toString());
        vod.setVodPlayUrl(playUrl.toString());
        return Result.string(vod);
    }

    /** Formats the episodes of one play list as {@code name$url} entries joined by {@code #}. */
    private String joinEpisodes(Element ul) {
        StringBuilder episodes = new StringBuilder();
        Elements items = ul.select("li");
        for (int j = 0; j < items.size(); j++) {
            Element li = items.get(j);
            if (j > 0) {
                episodes.append("#");
            }
            episodes.append(li.select("a").text()).append("$").append(siteUrl).append(li.select("a").attr("href"));
        }
        return episodes.toString();
    }

    /**
     * Searches for a keyword through the JSON search endpoint.
     *
     * @param key   search keyword; URL-encoded as UTF-8
     * @param quick not consulted
     * @return the serialised result built by {@code Result.string}; the list is empty
     *         when the endpoint returns no array
     */
    @Override
    public String searchContent(String key, boolean quick) throws Exception {
        // The single-argument encode() is deprecated and depends on the platform charset.
        String url = SEARCH_URL + URLEncoder.encode(key, "UTF-8");
        JSONArray items = JSONArray.parseArray(OkHttp.string(url, getHeader()));
        List<Vod> list = new ArrayList<>();
        // parseArray yields null for an empty body; treat that as "no results".
        if (items != null) {
            for (int j = 0; j < items.size(); j++) {
                JSONObject item = items.getJSONObject(j);
                String vid = siteUrl + item.getString("url");
                String name = item.getString("title");
                String pic = item.getString("thumb");
                String remark = item.getString("area");
                list.add(new Vod(vid, name, pic, remark));
            }
        }
        return Result.string(list);
    }

    /**
     * Resolves the stream URL of one episode. The play page references a script;
     * the m3u8 address is read from that script's {@code playarr_*[pid]} entry, where
     * {@code pid} is the number in the play page file name.
     *
     * @param flag     not consulted
     * @param id       play page URL ending in {@code /<number>.html}
     * @param vipFlags not consulted
     * @return the serialised player result; its URL is empty when any step finds nothing
     */
    @Override
    public String playerContent(String flag, String id, List<String> vipFlags) throws Exception {
        String realUrl = "";
        Matcher idMatcher = EPISODE_ID.matcher(id);
        if (idMatcher.find()) {
            String pid = idMatcher.group(1);
            String content = OkHttp.string(id, getHeader());
            Matcher scriptMatcher = PLAYER_SCRIPT.matcher(content);
            // Only request the script when the page actually references one.
            if (scriptMatcher.find() && !scriptMatcher.group(1).isEmpty()) {
                String script = OkHttp.string(scriptMatcher.group(1), getHeader());
                Matcher urlMatcher = Pattern.compile("playarr_(.*?)\\[" + pid + "\\]=\"(.*?\\.m3u8),-1,").matcher(script);
                if (urlMatcher.find()) {
                    realUrl = urlMatcher.group(2);
                }
            }
        }
        return Result.get().url(realUrl).header(getHeader()).string();
    }
}
复制成功

测试文件:

代码块
clike
自动换行
复制代码
package com.github.catvod.spider;

import static org.junit.jupiter.api.Assertions.*;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;

/**
 * Smoke tests for {@link CeChi}. Each test calls one spider interface with a sample
 * input, prints the returned string for manual inspection and checks that a
 * non-empty result came back.
 */
class CeChiTest {

    /** Fails the test when a spider call returned nothing. */
    private static void assertHasContent(String result) {
        assertNotNull(result);
        assertFalse(result.isEmpty());
    }

    // Intentionally empty: no initialisation behaviour is checked here.
    @org.junit.jupiter.api.Test
    void init() {
    }

    @org.junit.jupiter.api.Test
    void homeContent() throws Exception {
        String s = new CeChi().homeContent(false);
        System.out.println(s);
        assertHasContent(s);
    }

    @org.junit.jupiter.api.Test
    void categoryContent() throws Exception {
        String s = new CeChi().categoryContent("mov", "1", false, new HashMap<>());
        System.out.println(s);
        assertHasContent(s);
    }

    @org.junit.jupiter.api.Test
    void detailContent() throws Exception {
        String s = new CeChi().detailContent(Arrays.asList("https://cechi16.com/tv/88868/"));
        System.out.println(s);
        assertHasContent(s);
    }

    @org.junit.jupiter.api.Test
    void searchContent() throws Exception {
        String s = new CeChi().searchContent("斗罗大陆",false);
        System.out.println(s);
        assertHasContent(s);
    }

    @org.junit.jupiter.api.Test
    void playerContent() throws Exception {
        String s = new CeChi().playerContent("", "https://cechi16.com/tv/88868/40.html", new ArrayList<>());
        System.out.println(s);
        assertHasContent(s);
    }
}
复制成功

build.gradle中引入了新的jar:

代码块
clike
自动换行
复制代码
    // fastjson releases before 1.2.83 are affected by CVE-2022-25845; the spider
    // parses content fetched from remote sites, so use the patched release.
    implementation 'com.alibaba:fastjson:1.2.83'
复制成功