09
12月
2021
$url = 'https://item.jd.com/10038939766023.html';
// Scrape a JD (item.jd.com) product page: detail images, price, title and cover pictures.
// Everything that was collected ends up in $arr; per-image metadata is built in $save.
// NOTE(review): the guard below inspects $post['link'] while every request uses the
// hard-coded $url above — confirm whether $url is meant to be taken from $post['link'].
if(strpos($post['link'],'item.jd.com') !== false){
    // Shared GET helper for every request below.
    // Returns ['body' => string|false, 'type' => Content-Type reported by the server (may be empty)].
    // NOTE(review): TLS peer verification is disabled, exactly as in the original code; this
    // allows man-in-the-middle tampering and should be enabled once a CA bundle is available.
    $fetch = static function ($target, array $headers = [], $referer = null) {
        $ch = curl_init();
        if ($headers) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        }
        if ($referer !== null) {
            curl_setopt($ch, CURLOPT_REFERER, $referer);
        }
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_URL, $target);
        $body = curl_exec($ch);
        $type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
        curl_close($ch);

        return ['body' => $body, 'type' => $type];
    };

    // Normalises a scraped link: strips whitespace/quotes left over from CSS url("...")
    // and prepends "http:" when the link does not START with a scheme (e.g. "//img.jd.com/x").
    $withScheme = static function ($link) {
        $link = trim((string) $link, " \t\n\r\"'");

        return strpos($link, 'http') === 0 ? $link : 'http:'.$link;
    };

    // Fetch the product page with a browser-like User-Agent, spoofed client-IP headers and a referer.
    $header = [
        'User-Agent: Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'X-FORWARDED-FOR:154.125.25.15',
        'CLIENT-IP:154.125.25.15',
    ];
    $page = $fetch($url, $header, "http://www.baidu.com/");
    $html = (string) $page['body']; // a failed request becomes '' and is rejected just below

    // The page source carries the description endpoint as: desc: '<url>'
    $preg = "/desc:[\s]+\'(.*?)\'/ims";
    preg_match($preg,$html,$descMatch);
    if(empty($descMatch[1])){
        return $this->showMessage('当前商品介绍文件找不到,不可采集');
    }
    // Example of the description endpoint:
    // https://cd.jd.com/description/channel?skuId=<sku id>&mainSkuId=<main id>&charset=utf-8&cdn=2
    $desc_url = $withScheme($descMatch[1]);

    // SKU id = file name of the URL path without its extension; unlike a fixed-length
    // substr() this still works when the URL carries a query string or fragment.
    $id = pathinfo((string) parse_url($url, PHP_URL_PATH), PATHINFO_FILENAME);

    // The description endpoint answers with JSON whose "content" key holds the detail HTML.
    $descResponse = $fetch($desc_url);
    $descData = json_decode((string) $descResponse['body'], true);
    $desc = (is_array($descData) && isset($descData['content'])) ? (string) $descData['content'] : '';

    if(strpos($desc,'data-lazyload') !== false){
        // Layout 1: images are <img> tags; take the data-lazyload attribute value.
        $preg = "/<img.*?data-lazyload=\"(.*?)\".*?>/ims";
        preg_match_all($preg,$desc,$match);
    }elseif(strpos($desc,'ssd-module-wrap') !== false){
        // Layout 2: images are CSS backgrounds; take the value inside background-image:url(...).
        $preg = "/background-image\:url\((.*?)\)/ims";
        preg_match_all($preg,$desc,$match);
    }else{
        // Only these two layouts have been observed so far (an empty or missing description
        // also lands here); add further branches when new layouts show up.
        // NOTE(review): failures below use exit() while the first one returns
        // $this->showMessage() — kept as-is, consider unifying.
        exit('当前商品介绍未找到,不可采集');
    }

    // Price endpoint; it returns several prices per SKU, only the "p" field is used.
    $price_url = 'https://p.3.cn/prices/mgets?skuIds=J_'.$id;
    $priceResponse = $fetch($price_url);
    $price = json_decode((string) $priceResponse['body'], true);
    if(empty($price[0]['p'])){
        exit('当前商品价格未找到,不可采集');
    }
    $price = $price[0]['p'];

    // Title and cover thumbnails are read from the product page via the QueryList library.
    $ql = \QueryList::get($url);
    $title = $ql->find('.sku-name')->text();
    if(empty($title)){
        exit('当前商品标题未找到,不可采集');
    }
    $pics = $ql->find('#spec-list ul li img')->attrs('data-url');
    // NOTE(review): if attrs() returns a collection object rather than an array, empty()
    // is never true here — verify against the installed QueryList version.
    if(empty($pics)){
        exit('当前商品封面图未找到,不可采集');
    }

    $arr = [];
    $arr['content'] = ''; // detail section is rebuilt as a run of <img> tags (JD details are image-based)
    $arr['price'] = $price;
    $arr['title'] = $title;
    foreach($match[1] as $imgSrc){
        // Scraped value is untrusted: normalise the scheme and escape it for the attribute context.
        $imgSrc = $withScheme($imgSrc);
        $arr['content'] .= '<img src="'.htmlspecialchars($imgSrc, ENT_QUOTES, 'UTF-8', false).'">';
        // To store these images locally, follow the same steps as the cover pictures below.
    }

    $arr['pics'] = [];
    foreach($pics as $pic){
        // Image path; changing the n1 segment (n0-n5) selects a different size.
        $picUrl = 'https://img14.360buyimg.com/n1/'.$pic;
        $arr['pics'][] = $picUrl;

        // Download the picture so it can be re-hosted; ignore the rest if that is not needed.
        $image = $fetch($picUrl);
        $tmp_name = $image['body']; // raw image bytes, to be used when saving
        if(!is_string($tmp_name) || $tmp_name === ''){
            continue; // download failed: keep the remote URL in $arr['pics'], skip metadata
        }
        $imgs = getimagesizefromstring($tmp_name);
        if($imgs === false){
            continue; // response is not a decodable image
        }

        // Prefer the server-reported type (without parameters), fall back to the detected one.
        $mime = trim(explode(';', (string) $image['type'])[0]);
        if($mime === ''){
            $mime = $imgs['mime'];
        }
        $type_arr = explode('/',$mime);
        $suffix = count($type_arr) == 2 ? $type_arr[1] : 'jpg';

        $name = md5(uniqid(microtime(true),true)); // random, practically unique file name
        $save = [];
        $save['width'] = $imgs[0];          // image width in pixels
        $save['height'] = $imgs[1];         // image height in pixels
        $save['size'] = strlen($tmp_name);  // byte size of the downloaded image
        $save['type'] = $mime;              // MIME type
        $save['suffix'] = $suffix;
        $save['title'] = $name;
        $save['path'] = $name.'.'.$suffix;  // default relative path
        // $save now holds this picture's metadata; persisting it (and $tmp_name) is left to the caller.
    }
    dd($arr);
    // All collected data is in $arr.
    // The "cannot collect" exits above were observed when the page is refreshed several times in a
    // row and the data stops coming back; the cause is unknown, so those cases are filtered out for now.
}