利用 PHP 采集阿里巴巴(1688)搜索结果中的数据,只获取搜索出来的公司名称、地区,以及诚信通年份。
今天我们以关键词“机械”为例。通过抓包,发现搜索结果是由下面这个异步(JSONP)接口返回的:
http://s.1688.com/selloffer/rpc_async_render.jsonp?rpcflag=new&_serviceId_=marketOfferResultViewService&startIndex=0&_template_=controls%2Fnew_template%2Fproducts%2Fmarketoffersearch%2Fofferresult%2Fpkg-a%2Fviews%2Fofferresult.vm&keywords=%BB%FA%D0%B5&enableAsync=true&earseDirect=false&button_click=top&asyncCount=20&n=y&offset=8&async=true&uniqfield=pic_tag_id&token=2321131414&callback=jQuery183037169543863274157_1445569209470&beginPage=1
1.地址净化
a. 抓包得到的地址中如果带有时间戳参数 &_=1445569929894,可以删除
b.callback=jQuery183037169543863274157_1445569209470改成callback=jQuery
变成:
http://s.1688.com/selloffer/rpc_async_render.jsonp?rpcflag=new&_serviceId_=marketOfferResultViewService&startIndex=0&_template_=controls%2Fnew_template%2Fproducts%2Fmarketoffersearch%2Fofferresult%2Fpkg-a%2Fviews%2Fofferresult.vm&keywords=%BB%FA%D0%B5&enableAsync=true&earseDirect=false&button_click=top&asyncCount=20&n=y&offset=8&async=true&uniqfield=pic_tag_id&token=2321131414&callback=jQuery&beginPage=1
2.参数简单分析
a. keywords=%BB%FA%D0%B5 关键词
b. asyncCount=20 每批返回的数量。因为每一页有60个产品展示,每20个一批出现,所以一页有3个json请求(第一批到20,第二批到40,第三批到60)。所以这里果断设置 asyncCount=60,一次请求就能拿到一页的全部60个数据。
c. beginPage=1 第几页
说明,其实不止这些参数,还可以做更深入的筛选。例如,综合,销量,价格排序。价钱范围。地区,经营模式,合并供应商等。但博主只需要某关键词获取的公司名,公司地址,诚信通,这里可以举一反三。
3.数据筛选
其实我觉得返回的内容更像是一段 json 数据:外面包了一层 jQuery(...) 回调(即 JSONP),里面是经过转义的 HTML 片段,所以需要先去掉回调外壳和转义符,再用正则把公司名、地区、诚信通年份提取出来。
4.php源码
使用了thinkphp框架,因为比较快,而且方便。
class IndexAction extends Action
{
    /**
     * Crawl the 1688.com asynchronous search endpoint page by page and store
     * every offer's company name, location and TrustPass year in the "1688" table.
     *
     * The page range, page size and search keyword are fixed inside the method.
     *
     * @return void
     * @throws Exception when an HTTP request fails (see http()).
     */
    public function index()
    {
        $asyncCount = 60; // offers requested per page
        $startPage  = 1;  // first page to fetch
        $beginPage  = 40; // last page to fetch (inclusive)

        set_time_limit(0); // the crawl issues many sequential requests
        header("Content-Type: text/html; charset=utf-8");

        $sql1688 = M('1688');

        for ($i = $startPage; $i <= $beginPage; $i++) {
            $url = 'http://s.1688.com/selloffer/rpc_async_render.jsonp?earseDirect=false&n=y&showStyle=shopwindow'
                . '&_=1445493566796&rpcflag=new&_serviceId_=marketOfferResultViewService'
                . '&_template_=controls%2Fnew_template%2Fproducts%2Fmarketoffersearch%2Fofferresult%2Fpkg-a%2Fviews%2Fofferresult.vm'
                . '&startIndex=0&keywords=%C4%BE%B9%A4%BB%FA%D0%B5&showMySearchUrl=true&async=true&filt=y'
                . '&maxPage=100&enableAsync=true&asyncCount=' . $asyncCount
                . '&offset=9&priceEnd=3.4028235E38&uniqfield=userid&beginPage=' . $i
                . '&token=2321131414&callback=jQuery';

            // The endpoint is a JSONP resource and the URL already carries the full
            // query string, so request it with GET and no additional parameters.
            $output = $this->http($url, array(), 'GET');

            $data = $this->parseOffers($this->cleanResponse($output));
            $data = $this->charsetToUTF8($data);

            foreach ($data as $value) {
                $sql1688->add(array(
                    'company' => $value['company'],
                    'place'   => $value['place'],
                    'year'    => $value['year'],
                ));
            }
        }
    }

    /**
     * Reduce the raw JSONP response to a single-line HTML fragment that the
     * offer regexes in parseOffers() can be applied to.
     *
     * @param string $raw Response body as returned by http().
     *
     * @return string Cleaned markup ('' when the response is empty).
     */
    private function cleanResponse($raw)
    {
        $html = trim((string) $raw);
        if ($html === '') {
            return '';
        }

        // Strip the JSONP wrapper: leading "jQuery(" and trailing ")" or ");".
        if (strpos($html, 'jQuery(') === 0) {
            $html = (string) substr($html, 7);
        }
        $html = rtrim($html, "; \t\r\n");
        if (substr($html, -1) === ')') {
            $html = (string) substr($html, 0, -1);
        }

        // Replace the escaped whitespace sequences (\n, \r, \t written as two
        // characters) BEFORE stripslashes(). Otherwise stripslashes() leaves a
        // bare "n"/"r"/"t" behind that cannot be told apart from real letters.
        // NOTE(review): the payload is processed byte-wise before charset
        // conversion; a multi-byte character whose trailing byte is 0x5C would be
        // affected by the backslash handling - confirm against real responses.
        $html = str_replace(array('\r', '\n', '\t'), ' ', $html);
        $html = stripslashes($html);

        // Collapse whitespace runs and drop whitespace between adjacent tags.
        $collapsed = preg_replace('/\s+/', ' ', $html);
        if ($collapsed !== null) {
            $html = $collapsed;
        }
        $html = str_replace('> <', '><', $html);

        // Remove noise that gets in the way of the field regexes.
        $html = str_replace(
            array('|', '{}.{}.0.0', '<span class="sm-offerimg-companysep"></span>'),
            '',
            $html
        );

        $patterns = array(
            '/<script(.*)<\/script>/i',
            '/<div class="sm-offer-trigger"(.*)"/i',
            '/href="(.*?)"/i',
            '/<span>(.*?)">/i',
        );
        $stripped = preg_replace($patterns, '', $html);

        // preg_replace() yields null on a PCRE error; keep the unstripped markup then.
        return $stripped === null ? $html : $stripped;
    }

    /**
     * Split the cleaned markup into offer items and extract the wanted fields.
     *
     * @param string $html Markup produced by cleanResponse().
     *
     * @return array List of arrays with the keys 'company', 'place' and 'year'.
     */
    private function parseOffers($html)
    {
        $items = explode('<li class="sm-offer-item sw-dpl-offer-item "', $html);
        array_shift($items); // everything before the first offer item

        $data = array();
        foreach ($items as $item) {
            $company  = $this->firstMatch('/<a class="sm-offer-companyName sw-dpl-offer-companyName "[^>]*>(.*?)<\/a>/', $item);
            $location = $this->firstMatch('/<a class="sm-offer-location sm-offer-chanyedai"[^>]*>(.*?)<\/a>/', $item);
            $place    = $this->firstMatch('/<div class="sm-offer-location"[^>]*>(.*?)<\/div>/', $item);
            $year1    = $this->firstMatch('/<a class="sw-ui-flaticon-cxt16x16"[^>]*>(.*?)<\/a>/', $item);
            $year2    = $this->firstMatch('/<a cindex="[^>]*>(.*?)<\/a>/', $item);

            $data[] = array(
                'company' => $company,
                // Prefer the linked location; fall back to the plain location block.
                'place'   => $location !== '' ? $location : $place,
                'year'    => $year1 !== '' ? $year1 : $year2,
            );
        }

        return $data;
    }

    /**
     * Return the trimmed first capture group of $pattern in $subject.
     *
     * @param string $pattern PCRE pattern with at least one capture group.
     * @param string $subject Text to search.
     *
     * @return string The captured text, or '' when the pattern does not match.
     */
    private function firstMatch($pattern, $subject)
    {
        if (preg_match($pattern, $subject, $match) === 1 && isset($match[1])) {
            return trim($match[1]);
        }

        return '';
    }

    /**
     * Send an HTTP request with cURL and return the response body.
     *
     * @param string       $url    Target URL.
     * @param array|string $params Request parameters: appended to the query string
     *                             for GET, sent as the body for POST.
     * @param string       $method 'GET' or 'POST' (case-insensitive).
     * @param array        $header Extra HTTP header lines.
     * @param bool         $multi  For POST: pass $params to cURL unchanged (file
     *                             upload) instead of URL-encoding them.
     *
     * @return string Response body.
     * @throws Exception for an unsupported method or when cURL reports an error.
     */
    public function http($url, $params = array(), $method = 'POST', $header = array(), $multi = false)
    {
        // NOTE(review): certificate and host verification are switched off here;
        // that is unsafe for HTTPS targets and should be revisited.
        $opts = array(
            CURLOPT_TIMEOUT        => 60,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_HTTPHEADER     => $header,
        );

        /* Method-specific options */
        switch (strtoupper($method)) {
            case 'GET':
                $query = is_array($params) ? http_build_query($params) : (string) $params;
                if ($query !== '') {
                    // Respect a query string that is already part of the URL.
                    $url .= (strpos($url, '?') === false ? '?' : '&') . $query;
                }
                $opts[CURLOPT_URL] = $url;
                break;
            case 'POST':
                // File uploads need the raw array; everything else is URL-encoded.
                $params = $multi ? $params : http_build_query($params);
                $opts[CURLOPT_URL]        = $url;
                $opts[CURLOPT_POST]       = 1;
                $opts[CURLOPT_POSTFIELDS] = $params;
                break;
            default:
                throw new Exception('不支持的请求方式!');
        }

        /* Initialise and execute the cURL request */
        $ch = curl_init();
        curl_setopt_array($ch, $opts);
        $data  = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);

        if ($error) {
            throw new Exception('请求发生错误:' . $error);
        }

        return $data;
    }

    /**
     * Convert GB2312/GBK encoded strings to UTF-8, recursing into arrays.
     *
     * Values that are not strings, or that are detected as ASCII, UTF-8 or BIG5,
     * are returned unchanged.
     *
     * @param mixed $mixed String, or (nested) array of strings, to convert.
     *
     * @return mixed The input with GB2312/GBK strings converted to UTF-8.
     */
    public function charsetToUTF8($mixed)
    {
        if (is_array($mixed)) {
            foreach ($mixed as $k => $v) {
                $mixed[$k] = $this->charsetToUTF8($v);
            }

            return $mixed;
        }

        if (!is_string($mixed) || $mixed === '') {
            return $mixed;
        }

        $encode = mb_detect_encoding($mixed, array('ASCII', 'UTF-8', 'GB2312', 'GBK', 'BIG5'));

        // mbstring reports GB2312 as "EUC-CN" and GBK as "CP936".
        if ($encode === 'EUC-CN' || $encode === 'CP936') {
            $converted = iconv('GBK', 'UTF-8', $mixed);
            if ($converted !== false) {
                return $converted;
            }
        }

        return $mixed;
    }
}
5.乱码问题
采集到的数据,插入数据库的时候一直显示乱码,无论直接用iconv, mb_detect_encoding都不行。
后来在网上找到了解决方案。
/**
 * Convert UTF-8 encoded strings to GBK, recursing into arrays.
 *
 * Values that are not strings, or that are not detected as UTF-8, are returned
 * unchanged. A string that cannot be converted is also returned unchanged.
 *
 * @param mixed $mixed Source data: a string or a (nested) array of strings.
 *
 * @return mixed The data with UTF-8 strings converted to GBK.
 */
function charsetToGBK($mixed)
{
    if (is_array($mixed)) {
        foreach ($mixed as $k => $v) {
            $mixed[$k] = charsetToGBK($v);
        }

        return $mixed;
    }

    if (!is_string($mixed) || $mixed === '') {
        return $mixed;
    }

    $encode = mb_detect_encoding($mixed, array('ASCII', 'UTF-8', 'GB2312', 'GBK', 'BIG5'));

    if ($encode === 'UTF-8') {
        $converted = iconv('UTF-8', 'GBK', $mixed);
        // iconv() returns false when a character has no GBK representation;
        // keep the original text instead of replacing it with false.
        if ($converted !== false) {
            return $converted;
        }
    }

    return $mixed;
}

/**
 * Convert GB2312/GBK encoded strings to UTF-8, recursing into arrays.
 *
 * Values that are not strings, or that are detected as ASCII, UTF-8 or BIG5,
 * are returned unchanged. A string that cannot be converted is also returned
 * unchanged.
 *
 * @param mixed $mixed Source data: a string or a (nested) array of strings.
 *
 * @return mixed The data with GB2312/GBK strings converted to UTF-8.
 */
function charsetToUTF8($mixed)
{
    if (is_array($mixed)) {
        foreach ($mixed as $k => $v) {
            $mixed[$k] = charsetToUTF8($v);
        }

        return $mixed;
    }

    if (!is_string($mixed) || $mixed === '') {
        return $mixed;
    }

    $encode = mb_detect_encoding($mixed, array('ASCII', 'UTF-8', 'GB2312', 'GBK', 'BIG5'));

    // mbstring reports GB2312 as "EUC-CN" and GBK as "CP936"; both are decoded
    // as GBK, which is a superset of GB2312.
    if ($encode === 'EUC-CN' || $encode === 'CP936') {
        $converted = iconv('GBK', 'UTF-8', $mixed);
        if ($converted !== false) {
            return $converted;
        }
    }

    return $mixed;
}