09 12 2021
$url = 'https://item.jd.com/10038939766023.html';
// Scrape a JD.com product page: description images, price, title and cover pictures.
// The collected data ends up in $arr (keys: content, price, title, pics).
// NOTE(review): the guard below inspects $post['link'], but every request uses the
// hard-coded $url above — presumably $url should come from $post['link']; confirm.
if(strpos($post['link'],'item.jd.com') !== false){
	// Small HTTP GET helper shared by every request in this script.
	// Returns array(body|false, content-type|null|false).
	// NOTE(review): peer verification is disabled as in the original code, which
	// exposes the requests to man-in-the-middle tampering; enable it if a CA bundle
	// is available.
	$fetch = function ($target, array $headers = array(), $referer = null) {
		$ch = curl_init();
		if(!empty($headers)){
			curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
		}
		if($referer !== null){
			curl_setopt($ch, CURLOPT_REFERER, $referer);
		}
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
		// Bound the request so a stalled remote host cannot hang the whole script.
		curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
		curl_setopt($ch, CURLOPT_TIMEOUT, 30);
		curl_setopt($ch, CURLOPT_URL, $target);
		$body = curl_exec($ch);
		$contentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
		curl_close($ch);
		return array($body, $contentType);
	};

	// Fetch the product page with a browser-like User-Agent, spoofed client-IP
	// headers and a Baidu referer.
	$header = array(
		'User-Agent: Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
		'X-FORWARDED-FOR:154.125.25.15',
		'CLIENT-IP:154.125.25.15',
	);
	list($html) = $fetch($url, $header, "http://www.baidu.com/");

	// The page source carries the description endpoint as: desc: '<url>'
	$preg = "/desc:[\s]+\'(.*?)\'/ims";
	preg_match($preg,(string)$html,$match);
	if(empty($match[1])){
		// NOTE(review): this is the only failure path that returns through
		// $this->showMessage(); the others call exit(). Kept as in the original.
		return $this->showMessage('当前商品介绍文件找不到,不可采集');
	}
	// The description link looks like:
	// https://cd.jd.com/description/channel?skuId=<sku id>&mainSkuId=<main id>&charset=utf-8&cdn=2
	$desc_url = $match[1];
	if(strpos($desc_url,'http') !== 0){
		// Protocol-relative link ("//cd.jd.com/..."): add a scheme so cURL accepts it.
		$desc_url = 'http:'.$desc_url;
	}

	// SKU id = file name of the URL path without ".html". Working on the parsed path
	// keeps the id intact when the URL carries a query string or fragment.
	$id = basename((string)parse_url($url, PHP_URL_PATH), '.html');

	list($desc) = $fetch($desc_url);
	$desc = json_decode((string)$desc,true);
	if(!is_array($desc) || !isset($desc['content']) || !is_string($desc['content'])){
		// Failed request or unexpected payload: same outcome as an unknown layout.
		exit('当前商品介绍未找到,不可采集');
	}
	$desc = $desc['content'];
	if(strpos($desc,'data-lazyload') !== false){
		// Layout 1: images are <img> tags carrying the real URL in data-lazyload.
		$preg = "/<img.*?data-lazyload=\"(.*?)\".*?>/ims";
		preg_match_all($preg,$desc,$match);

	}elseif(strpos($desc,'ssd-module-wrap') !== false){
		// Layout 2: images are CSS backgrounds — background-image:url(<here>).
		$preg = "/background-image\:url\((.*?)\)/ims";
		preg_match_all($preg,$desc,$match);

	}else{
		// Only these two layouts are handled; add further branches as new ones appear.
		exit('当前商品介绍未找到,不可采集');
	}
	$images = $match[1];

	// Price endpoint; it reports several prices, only the first entry's "p" is used.
	$price_url = 'https://p.3.cn/prices/mgets?skuIds=J_'.rawurlencode($id);
	list($price) = $fetch($price_url);
	$price = json_decode((string)$price,true);
	if(empty($price[0]['p'])){
		exit('当前商品价格未找到,不可采集');
	}
	$price = $price[0]['p'];

	// Title and cover thumbnails are read from the page through the QueryList library.
	$ql = \QueryList::get($url);
	$title = $ql->find('.sku-name')->text();
	if(empty($title)){
		exit('当前商品标题未找到,不可采集');
	}
	$pics = $ql->find('#spec-list ul li img')->attrs('data-url');
	if(empty($pics)){
		exit('当前商品封面图未找到,不可采集');
	}

	$arr = [];
	$arr['content'] = '';// description is rebuilt as a sequence of <img> tags
	$arr['price'] = $price;// price
	$arr['title'] = $title;// title
	foreach($images as $v){
		// background-image captures may be wrapped in quotes: url("...") / url('...')
		$v = trim($v, " \t\n\r\"'");
		if($v === ''){
			continue;
		}
		if(strpos($v,'http') !== 0){
			// Protocol-relative URL: add a scheme, otherwise a later download fails.
			$v = 'http:'.$v;
		}
		$arr['content'] .= '<img src="'.$v.'">';
		// To store these images as well, follow the cover-picture loop below.
	}

	$arr['pics'] = [];
	foreach($pics as $v){
		$v = 'https://img14.360buyimg.com/n1/'.$v;// image path; change n1 to n0-n5 for other sizes
		$arr['pics'][] = $v;
		// Download the picture so it can be stored on your own storage.
		// Skip everything below if you do not need to keep a copy.
		list($binary, $mime) = $fetch($v);
		if($binary === false || $binary === ''){
			continue;// download failed, nothing to describe
		}
		$imgs = getimagesizefromstring($binary);
		if($imgs === false){
			continue;// response body is not a readable image
		}
		// Use the Content-Type of the same response (no second request); drop any
		// "; charset=..." parameter and fall back to the type detected from the bytes.
		$mime = is_string($mime) ? trim(current(explode(';',$mime))) : '';
		if($mime === ''){
			$mime = $imgs['mime'];
		}
		$type_arr = explode('/',$mime);
		$suffix = count($type_arr) == 2 ? $type_arr[1] : 'jpg';
		$name = md5(uniqid(microtime(true),true));// random, practically unique file name
		$save = [];// start clean so values never leak from the previous picture
		$save['width'] = $imgs[0];// image width
		$save['height'] = $imgs[1];// image height
		$save['size'] = strlen($binary);// image size in bytes
		$save['type'] = $mime;// image MIME type
		$save['suffix'] = $suffix;
		$save['title'] = $name;
		$save['path'] = $name.'.'.$suffix;// default relative path
		// $save now describes the downloaded picture ($binary holds its bytes);
		// persisting it is left to the integrator.

	}
	dd($arr);
	// All collected data is in $arr.
	// The "cannot be collected" exits above fire when the page is requested several
	// times in a row; the cause is unknown, so those cases are simply filtered out.

}
延伸阅读
    发表评论