HEX
Server: nginx/1.28.1
System: Linux 10-41-63-61 6.8.0-31-generic #31-Ubuntu SMP PREEMPT_DYNAMIC Sat Apr 20 00:40:06 UTC 2024 x86_64
User: www (1001)
PHP: 7.4.33
Disabled: passthru,exec,system,putenv,chroot,chgrp,chown,shell_exec,popen,proc_open,pcntl_exec,ini_alter,ini_restore,dl,openlog,syslog,readlink,symlink,popepassthru,pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,imap_open,apache_setenv
Upload Files
File: /www/wwwroot/crm.jmfdbn.com/application/collection/event/Collection.php
<?php
// +----------------------------------------------------------------------
// | Yzncms [ 御宅男工作室 ]
// +----------------------------------------------------------------------
// | Copyright (c) 2018 http://yzncms.com All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: 御宅男 <530765310@qq.com>
// +----------------------------------------------------------------------

// +----------------------------------------------------------------------
// | 采集管理
// +----------------------------------------------------------------------
namespace app\collection\event;

use QL\QueryList;

/**
 * Content collector (scraper) driven by a rule configuration array.
 *
 * Typical flow: init() with the rule configuration, url_list() to build the
 * list pages to visit, get_url_lists() to extract article links from one list
 * page, and get_content() to extract the configured fields from one article.
 * echo_msg() streams progress lines to the browser while a job is running.
 */
class Collection
{
    /** @var array Rule configuration passed to init(). */
    public $_config;

    /** @var string URL of the page currently processed by get_content(). */
    public $_url;

    /**
     * Store the rule configuration used by every other method.
     *
     * @param array $config Collection rule configuration.
     */
    public function init($config)
    {
        $this->_config = $config;
    }

    /**
     * Build the list of list-page URLs to collect from.
     *
     * Behaviour depends on the 'sourcetype' config value:
     *  - 1: sequence; '(*)' in 'urlpage' is replaced by every page number from
     *       'pagesize_start' up to $num (or 'pagesize_end'), stepping by
     *       'par_num'. The result is keyed by page number.
     *  - 2: several URLs, one per line of 'urlpage'.
     *  - 3 / 4: single URL / RSS; 'urlpage' is returned as the only entry.
     *
     * @param int|string $num Last page number for the sequence type; empty
     *                        means "use 'pagesize_end'".
     * @return array List of URLs (empty for an unknown source type).
     */
    public function url_list($num = '')
    {
        $url = [];
        $sourceType = isset($this->_config['sourcetype']) ? $this->_config['sourcetype'] : null;
        switch ($sourceType) {
            case '1': // numbered sequence
                $start = (int) $this->_config['pagesize_start'];
                $last = (int) (empty($num) ? $this->_config['pagesize_end'] : $num);
                if ($last < $start) {
                    $last = $start;
                }
                // A step of 0 (or a non-numeric one) would never advance the
                // counter and loop forever, so fall back to 1.
                $step = isset($this->_config['par_num']) ? (int) $this->_config['par_num'] : 1;
                if ($step < 1) {
                    $step = 1;
                }
                for ($i = $start; $i <= $last; $i += $step) {
                    $url[$i] = str_replace('(*)', (string) $i, $this->_config['urlpage']);
                }
                break;
            case '2': // several URLs, one per line
                // Accept CRLF, LF and CR line endings and ignore blank lines.
                $lines = preg_split('/\r\n|\r|\n/', (string) $this->_config['urlpage']);
                foreach ($lines as $line) {
                    $line = trim($line);
                    if ($line !== '') {
                        $url[] = $line;
                    }
                }
                break;
            case '3': // single URL
            case '4': // RSS
                $url[] = $this->_config['urlpage'];
                break;
        }
        return $url;
    }

    /**
     * Extract article links (URL + title) from one list page.
     *
     * Links are selected with the 'url_rule1'..'url_rule3' config values, then
     * filtered by 'url_contain' (must contain) and 'url_except' (must not
     * contain). Relative links are made absolute with url_check().
     *
     * @param string $url List page URL.
     * @return array|false Entries of the form ['url' => ..., 'title' => ...],
     *                     keyed by their position in the query result, or
     *                     false when $url is empty.
     */
    public function get_url_lists($url)
    {
        if (!$url) {
            return false;
        }

        // Collection rules: the same selector yields the link and its title.
        $rules = [
            'url' => [$this->_config['url_rule1'], $this->_config['url_rule2'], $this->_config['url_rule3']],
            'title' => [$this->_config['url_rule1'], 'html', $this->_config['url_rule3']],
        ];
        $list = $this->fetch_page($url)->rules($rules)->query()->getData();

        $mustContain = !empty($this->_config['url_contain']) ? $this->_config['url_contain'] : null;
        $mustNotContain = !empty($this->_config['url_except']) ? $this->_config['url_except'] : null;

        $data = [];
        foreach ($list as $k => $v) {
            $link = isset($v['url']) ? (string) $v['url'] : '';
            if ($link === '') {
                // Nothing was extracted for this element; resolving an empty
                // link would only produce the list page's own directory.
                continue;
            }
            if ($mustContain !== null && strpos($link, $mustContain) === false) {
                continue;
            }
            if ($mustNotContain !== null && strpos($link, $mustNotContain) !== false) {
                continue;
            }
            $data[$k]['url'] = $this->url_check($link, $url);
            $data[$k]['title'] = strip_tags(isset($v['title']) ? (string) $v['title'] : '');
        }
        return $data;
    }

    /**
     * Extract the configured fields from one article page.
     *
     * Every 'customize_config' entry with a non-empty 'name' becomes one rule
     * built from its 'selector', 'attr' and 'filter'. An entry with a non-empty
     * 'value' always yields that fixed value. For 'html' fields the src of each
     * <img> tag is rewritten to an absolute URL.
     *
     * @param string $url Article URL.
     * @return mixed First row of the query result, or null when the query
     *               produced no rows.
     */
    public function get_content($url)
    {
        $this->_url = $url;

        $fields = isset($this->_config['customize_config']) ? (array) $this->_config['customize_config'] : [];
        // Start from an empty rule set so that a configuration without any
        // named field does not leave $rules undefined.
        $rules = [];
        foreach ($fields as $v) {
            if (empty($v['name'])) {
                continue;
            }
            $rules[$v['name']] = [$v['selector'], $v['attr'], $v['filter'], function ($content) use ($v) {
                if (!empty($v['value'])) {
                    return $v['value'];
                }
                if ("html" == $v['attr']) {
                    $content = preg_replace_callback('/<img[^>]*src=[\'"]?([^>\'"\s]*)[\'"]?[^>]*>/i', [$this, 'download_img_callback'], $content);
                }
                return $content;
            }];
        }

        $cont = $this->fetch_page($url)->rules($rules)->query()->getData();
        return isset($cont[0]) ? $cont[0] : null;
    }

    /**
     * Fetch a page with QueryList, converting it to UTF-8 unless the
     * 'sourcecharset' config value already says 'utf-8'.
     *
     * @param string $url Page URL.
     * @return mixed Whatever QueryList::get() (and its encoding chain) returns.
     */
    private function fetch_page($url)
    {
        $page = QueryList::get($url);
        $charset = isset($this->_config['sourcecharset']) ? (string) $this->_config['sourcecharset'] : '';
        if ('utf-8' === $charset) {
            return $page;
        }
        return $page->removeHead()->encoding('UTF-8');
    }

    /**
     * preg_replace_callback() adapter: rewrites one matched <img> tag so its
     * src is absolute.
     *
     * @param array $matches [0] is the whole tag, [1] the captured src value.
     * @return string The tag with the src made absolute.
     */
    protected function download_img_callback($matches)
    {
        return $this->download_img($matches[0], $matches[1]);
    }

    /**
     * Replace an image address inside a tag with its absolute form, resolved
     * against the page stored in $_url.
     *
     * @param string $old Original tag markup.
     * @param string $out Image address found in the tag.
     * @return string The tag with $out replaced, or $old unchanged when either
     *                argument is empty or $out needs no resolving.
     */
    protected function download_img($old, $out)
    {
        if (empty($old) || empty($out)) {
            return $old;
        }
        // url_check() leaves absolute and data:/other scheme URIs untouched,
        // so those tags come back unchanged.
        $absolute = $this->url_check($out, $this->_url);
        return $absolute === $out ? $old : str_replace($out, $absolute, $old);
    }

    /**
     * Turn a possibly relative URL into an absolute one.
     *
     *  - URLs that start with a scheme (http:, https:, data:, mailto:, ...)
     *    are returned unchanged.
     *  - Protocol-relative URLs ('//host/path') get the base URL's scheme.
     *  - Root-relative URLs ('/path') get the base URL's scheme, host and port.
     *  - Other relative URLs are prefixed with the 'page_base' config value
     *    when it is non-empty, otherwise with the directory of the base URL.
     *
     * Dot segments ('./', '../') are not collapsed.
     *
     * @param string $url     URL to resolve.
     * @param string $baseurl URL of the page the link was found on.
     * @return string Resolved URL.
     */
    protected function url_check($url, $baseurl)
    {
        $url = (string) $url;
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url)) {
            return $url;
        }

        $urlinfo = parse_url((string) $baseurl);
        if (!is_array($urlinfo)) {
            $urlinfo = [];
        }
        $scheme = isset($urlinfo['scheme']) ? $urlinfo['scheme'] : '';
        $host = isset($urlinfo['host']) ? $urlinfo['host'] : '';
        $port = isset($urlinfo['port']) ? ':' . $urlinfo['port'] : '';
        $path = isset($urlinfo['path']) ? $urlinfo['path'] : '';
        $origin = $scheme . '://' . $host . $port;

        if (strpos($url, '//') === 0) {
            return $scheme === '' ? $url : $scheme . ':' . $url;
        }
        if (strpos($url, '/') === 0) {
            return $origin . $url;
        }
        if (!empty($this->_config['page_base'])) {
            return $this->_config['page_base'] . $url;
        }

        if (substr($path, -1) === '/') {
            $dir = substr($path, 0, -1);
        } else {
            // dirname() uses backslashes on Windows; normalise them.
            $dir = str_replace('\\', '/', dirname($path));
        }
        // dirname('/index.html') is '/', which would otherwise produce a
        // doubled slash after the host; '.' means "no directory".
        $dir = rtrim((string) $dir, '/');
        if ($dir === '.') {
            $dir = '';
        }
        return $origin . $dir . '/' . $url;
    }

    /** @var bool|null True once the streaming preamble has been sent. */
    private static $echo_msg_head = null;

    /**
     * Print one progress message and push it to the browser immediately.
     *
     * The first call also sends the headers, a small style sheet and padding
     * so that browsers and servers start rendering right away.
     *
     * @param string $str     Message markup (printed as-is, not escaped).
     * @param string $color   CSS colour of the message.
     * @param bool   $echo    Currently unused; kept for compatibility.
     * @param string $end_str Markup appended after the message paragraph.
     */
    public function echo_msg($str, $color = 'red', $echo = true, $end_str = '')
    {
        if (!isset(self::$echo_msg_head)) {
            self::$echo_msg_head = true;
            if (!headers_sent()) {
                header('Content-type: text/html; charset=utf-8');
                header('X-Accel-Buffering: no');
            }
            @ini_set('output_buffering', 'Off');

            // Only drop a buffer when one is actually active.
            if (ob_get_level() > 0) {
                ob_end_clean();
            }
            @ob_implicit_flush(true);

            $outputSize = intval(ini_get('output_buffering'));
            $serverSoftware = isset($_SERVER['SERVER_SOFTWARE']) ? (string) $_SERVER['SERVER_SOFTWARE'] : '';

            if (preg_match('/\biis\b/i', $serverSoftware)) {
                if ($outputSize < 1024 * 1024 * 4) {
                    $outputSize = 1024 * 1024 * 4;
                    echo '<!-- iis默认需输出4mb数据才能实时显示-->';
                }
            }
            echo '<style type="text/css">body{padding:0 5px;font-size:14px;color:#000;}p{padding:0;margin:0;}a{color:#aaa;}</style>';
            // Pad past the buffering threshold (at least 4 KB) so the first
            // message is displayed without waiting for more output.
            echo str_pad(' ', max(1024 * 4, $outputSize));
        }
        echo '<p style="color:' . $color . ';">' . $str . '</p>' . $end_str;
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    }
}