PHP 爬虫：使用 QueryList 抓取 JS 动态渲染的页面数据

in 日常随笔 with 0 comment 访问: 492 次

jaeger/querylist爬虫工具

官方文档 https://querylist.cc/docs/guide/v4/PhantomJS

// Base package
composer require jaeger/querylist
// Plugin for scraping JS-rendered pages (scraping dynamically rendered pages also requires downloading the PhantomJS tool: https://phantomjs.org/download.html)
composer require jaeger/querylist-phantomjs
    // Minimal usage: register the PhantomJs plugin on the QueryList instance,
    // fetch the rendered HTML through browser(), then query the resulting DOM.
    // NOTE(review): this URL has no scheme, while the later example uses a full
    // https:// URL — confirm whether browser() accepts a bare host name.
    $url = 'www.litblc.com';    // URL of the page to scrape
    $phantomPath = 'E:/githubShyzhen/FakePHP/phantomjs-2.1.1-windows/bin/phantomjs.exe';    // Path to the downloaded PhantomJS executable
    $ql = QueryList::getInstance();
    $ql->use(PhantomJs::class, $phantomPath);
    $html = $ql->browser($url)->getHtml();
    $dom = QueryList::html($html);
    
    // Read the text of the elements matching the selector.
    $dom->find('.title-name')->text();
    ...

示例代码

    /**
     * Entry point of the example: runs the scraper for the hard-coded hero id 105.
     */
    public function spader()
    {
        $heroId = 105;

        $this->handleSpader($heroId);
    }

    /**
     * Scrapes the hero detail page for the given hero id, prints an INSERT
     * statement for the `wangzhe_hero_tutorial` table and terminates the script.
     *
     * The page is rendered through the PhantomJs plugin before it is parsed.
     * Three groups of data are read from the rendered DOM and JSON-encoded:
     *  - "ming" entries (ids from `.sugg-u1[data-ming]`, texts from `.sugg-u1 li p`),
     *  - equipment entries (ids from `.equip-list[data-item]`, names from `#Jname`),
     *  - counter heroes (id taken from the image file name, intro from `.hero-list-desc p`).
     *
     * Every value is escaped before being placed into the SQL text, because the
     * scraped strings and the JSON documents may contain quotes and backslashes.
     *
     * @param int|string $id Hero id used to build the detail-page URL.
     *
     * @return void This method echoes the SQL and then calls exit.
     */
    public function handleSpader($id)
    {
        $url = 'https://pvp.qq.com/web201605/herodetail/'.$id.'.shtml';
        $ql = QueryList::getInstance();
        $ql->use(PhantomJs::class,'E:/githubShyzhen/FakePHP/phantomjs-2.1.1-windows/bin/phantomjs.exe');
        $html = $ql->browser($url)->getHtml();

        $dom = QueryList::html($html);

        $mingTips = $dom->find('.sugg-tips')->text();
        $equipTips = $dom->find('.equip-tips')->eq(0)->text();

        // ming JSON: three entries, each with a name paragraph followed by up to three intro paragraphs.
        $mingIds = $this->splitPipeList($dom->find('.sugg-u1')->attr('data-ming'));
        $mingRes = [];
        for ($i = 0; $i < 3; $i++) {
            $item = $dom->find('.sugg-u1 li')->eq($i);
            $intros = [];
            for ($j = 1; $j <= 3; $j++) {
                $intros[] = $item->find('p')->eq($j)->text();
            }
            $mingRes[] = [
                'id' => $mingIds[$i] ?? '',
                'name' => $item->find('p')->eq(0)->text(),
                // Missing intro paragraphs leave empty segments; trim the outer separators.
                'intro' => trim(implode('|', $intros), '|'),
            ];
        }
        $mingJson = json_encode($mingRes, JSON_UNESCAPED_UNICODE);

        // equipment JSON: six entries taken from the first equipment list.
        $equipmentDom = $dom->find('.equip-list')->eq(0);
        $equipIds = $this->splitPipeList($equipmentDom->attr('data-item'));
        $eRes = [];
        for ($i = 0; $i < 6; $i++) {
            $eRes[] = [
                'id' => $equipIds[$i] ?? '',
                'name' => $equipmentDom->find('#Jname')->eq($i)->text(),
                'intro' => '',
            ];
        }
        $eJson = json_encode($eRes, JSON_UNESCAPED_UNICODE);

        // counterHero JSON: two entries from the second `.hero-info` box.
        $heroDom = $dom->find('.hero-info-box')->find('.hero-info')->eq(1);
        $heroRes = [];
        for ($i = 0; $i < 2; $i++) {
            $imageSrc = (string) $heroDom->find('img')->eq($i)->src;
            // The hero id is the image file name without its extension.
            $heroId = pathinfo($imageSrc, PATHINFO_FILENAME);
            $heroRes[] = [
                'id' => $heroId,
                'name' => $this->handleHeroName($heroId),
                'intro' => $heroDom->find('.hero-list-desc')->find('p')->eq($i)->text(),
            ];
        }
        $heroJson = json_encode($heroRes, JSON_UNESCAPED_UNICODE);

        // Assemble the SQL. The timestamps are fixed literals, as in the original example.
        $values = [
            $id,
            $mingJson,
            $mingTips,
            $eJson,
            $equipTips,
            $heroJson,
            '2022-03-29 16:29:53',
            '2022-03-29 16:29:53',
        ];
        $quoted = [];
        foreach ($values as $value) {
            $quoted[] = $this->quoteSqlLiteral($value);
        }
        $sql = 'INSERT INTO `wangzhe_hero_tutorial` (`hero_id`,`ming`,`ming_tips`,`equipment`,`equipment_tips`,`counter_hero`, `created_at`, `updated_at`) VALUES ('.implode(', ', $quoted).');';

        echo $sql;

        exit;
    }

    /**
     * Splits a pipe-separated attribute value into its parts.
     *
     * @param string|null $value Raw attribute value; null when the attribute is absent.
     *
     * @return string[] The parts; a single empty string when the value is empty.
     */
    private function splitPipeList($value)
    {
        return explode('|', (string) $value);
    }

    /**
     * Wraps a value in single quotes for use as a literal in the generated SQL text.
     *
     * Backslashes and quotes are escaped with addslashes() so that JSON escape
     * sequences and apostrophes in scraped text survive the SQL parser.
     * NOTE(review): this assumes MySQL's default backslash-escape mode; if the
     * statement is ever executed from PHP, use a prepared statement instead.
     *
     * @param mixed $value Scalar value to quote.
     *
     * @return string The quoted literal.
     */
    private function quoteSqlLiteral($value)
    {
        return "'".addslashes((string) $value)."'";
    }

    /**
     * Looks up a hero's display name by id in the built-in id-to-name map.
     *
     * @param int|string $heroId Hero id to look up.
     *
     * @return string|null The hero name, or null when the id is not in the map.
     */
    public function handleHeroName($heroId)
    {
        $json = '{"105": "廉颇","106": "小乔"}';
        $heroArr = json_decode($json, true);

        // Only a few ids are mapped; return null for unknown ids without raising
        // an undefined-index warning.
        return $heroArr[$heroId] ?? null;
    }

赞赏支持
Responses