利用php采集阿里巴巴上面搜索的数据,单纯获取搜索出来的,公司名称,地区,还有就是诚信通年份。
今天我们以关键词“机械”来做例子。通过抓包可以发现,搜索结果是由下面这个 JSONP 接口异步加载的:
http://s.1688.com/selloffer/rpc_async_render.jsonp?rpcflag=new&_serviceId_=marketOfferResultViewService&startIndex=0&_template_=controls%2Fnew_template%2Fproducts%2Fmarketoffersearch%2Fofferresult%2Fpkg-a%2Fviews%2Fofferresult.vm&keywords=%BB%FA%D0%B5&enableAsync=true&earseDirect=false&button_click=top&asyncCount=20&n=y&offset=8&async=true&uniqfield=pic_tag_id&token=2321131414&callback=jQuery183037169543863274157_1445569209470&beginPage=1
1.地址净化
a.&_=1445569929894 可以删除
b.callback=jQuery183037169543863274157_1445569209470改成callback=jQuery

变成:
http://s.1688.com/selloffer/rpc_async_render.jsonp?rpcflag=new&_serviceId_=marketOfferResultViewService&startIndex=0&_template_=controls%2Fnew_template%2Fproducts%2Fmarketoffersearch%2Fofferresult%2Fpkg-a%2Fviews%2Fofferresult.vm&keywords=%BB%FA%D0%B5&enableAsync=true&earseDirect=false&button_click=top&asyncCount=20&n=y&offset=8&async=true&uniqfield=pic_tag_id&token=2321131414&callback=jQuery&beginPage=1
2.参数简单分析
a. keywords=%BB%FA%D0%B5 关键词
b. asyncCount=20 因为每一页展示60个产品,每20个一批异步加载,所以一页会发出3个json请求:第一批到20个,第二批到40个,第三批到60个。所以这里直接设置 asyncCount=60,一次请求就能拿到一页的全部60条数据。
c. beginPage=1 第几页
说明,其实不止这些参数,还可以做更深入的筛选。例如,综合,销量,价格排序。价钱范围。地区,经营模式,合并供应商等。但博主只需要某关键词获取的公司名,公司地址,诚信通,这里可以举一反三。
3.数据筛选
接口返回的内容其实更像是一段 JSON 数据(外面包了一层 JSONP 回调),其中带有转义过的 HTML 片段。所以需要先去掉回调包装和转义字符,再用正则提取所需字段。
4.php源码
使用了thinkphp框架,因为比较快,而且方便。
/**
 * Crawls 1688.com offer-search result pages and stores, for each offer, the
 * company name, its location and its membership-year text in the `1688` table.
 */
class IndexAction extends Action {

    /**
     * Fetch every configured result page, extract the offers and insert them.
     *
     * Each page is requested with asyncCount=60, parsed, converted from GBK to
     * UTF-8 and written row by row through the ThinkPHP model M('1688').
     *
     * @throws Exception When an HTTP request fails (see http()).
     */
    public function index(){
        $asyncCount = 60; // offers requested per call (one full result page)
        $startPage  = 1;  // first page to fetch
        $lastPage   = 40; // last page to fetch (inclusive)

        set_time_limit(0); // lift the execution time limit for the long crawl
        header("Content-Type: text/html; charset=utf-8");

        // One model instance is enough for all inserts.
        $model = M('1688');

        for ($page = $startPage; $page <= $lastPage; $page++) {
            $url = "http://s.1688.com/selloffer/rpc_async_render.jsonp?earseDirect=false&n=y&showStyle=shopwindow&_=1445493566796&rpcflag=new&_serviceId_=marketOfferResultViewService&_template_=controls%2Fnew_template%2Fproducts%2Fmarketoffersearch%2Fofferresult%2Fpkg-a%2Fviews%2Fofferresult.vm&startIndex=0&keywords=%C4%BE%B9%A4%BB%FA%D0%B5&showMySearchUrl=true&async=true&filt=y&maxPage=100&enableAsync=true&asyncCount={$asyncCount}&offset=9&priceEnd=3.4028235E38&uniqfield=userid&beginPage={$page}&token=2321131414&callback=jQuery";

            // The URL already carries its full query string, so request it as
            // a plain GET with no extra parameters.
            $offers = $this->extractOffers($this->http($url, array(), 'GET'));

            foreach ($this->charsetToUTF8($offers) as $offer) {
                $model->add(array(
                    'company' => $offer['company'],
                    'place'   => $offer['place'],
                    'year'    => $offer['year'],
                ));
            }
        }
    }

    /**
     * Split a raw JSONP response into offers and pull the wanted fields.
     *
     * @param string $response Raw body returned by the search endpoint.
     *
     * @return array List of arrays with the keys 'company', 'place', 'year';
     *               a field whose pattern does not match is an empty string.
     */
    private function extractOffers($response){
        $html  = $this->cleanResponse($response);
        $items = explode('<li class="sm-offer-item sw-dpl-offer-item "', $html);
        array_shift($items); // everything before the first <li> is not an offer

        $offers = array();
        foreach ($items as $item) {
            $location = $this->matchText('/<a class="sm-offer-location sm-offer-chanyedai"[^>]*>(.*?)<\/a>/', $item);
            $place    = $this->matchText('/<div class="sm-offer-location"[^>]*>(.*?)<\/div>/', $item);
            $year1    = $this->matchText('/<a class="sw-ui-flaticon-cxt16x16"[^>]*>(.*?)<\/a>/', $item);
            $year2    = $this->matchText('/<a cindex="[^>]*>(.*?)<\/a>/', $item);

            $offers[] = array(
                'company' => $this->matchText('/<a class="sm-offer-companyName sw-dpl-offer-companyName "[^>]*>(.*?)<\/a>/', $item),
                // Prefer the link text; fall back to the plain location div.
                'place'   => $location !== '' ? $location : $place,
                'year'    => $year1 !== '' ? $year1 : $year2,
            );
        }
        return $offers;
    }

    /**
     * Strip the JSONP wrapper, escapes and unwanted markup from a response.
     *
     * @param string $response Raw body returned by the search endpoint.
     *
     * @return string Cleaned markup ('' when the regex engine reports an error).
     */
    private function cleanResponse($response){
        $output = str_replace('jQuery(', '', (string) $response);
        $output = (string) substr($output, 0, -2); // drop the trailing two characters of the JSONP wrapper

        // Remove the two-character "\n" escapes BEFORE stripslashes(). If they
        // are left in, stripslashes() turns each one into a bare letter "n",
        // which cannot be told apart from a real "n" in class names or text.
        $output = str_replace('\\n', '', $output);
        $output = stripslashes($output);

        $output = str_replace(
            array('|', '{}.{}.0.0', '<span class="sm-offerimg-companysep"></span>'),
            '',
            $output
        );

        $patterns = array(
            '/<script(.*)<\/script>/i',
            '/<div class="sm-offer-trigger"(.*)"/i',
            '/href="(.*?)"/i',
            '/<span>(.*?)">/i',
        );
        $cleaned = preg_replace($patterns, '', $output);

        // preg_replace() returns null on a PCRE error.
        return $cleaned === null ? '' : $cleaned;
    }

    /**
     * Return the trimmed first capture group of $pattern in $html.
     *
     * @param string $pattern PCRE pattern with one capture group.
     * @param string $html    Markup to search.
     *
     * @return string Captured text, or '' when the pattern does not match.
     */
    private function matchText($pattern, $html){
        if (!preg_match($pattern, $html, $m) || !isset($m[1])) {
            return '';
        }
        return trim($m[1]);
    }

    /**
     * Send an HTTP request with cURL and return the response body.
     *
     * @param string       $url    Target URL.
     * @param array|string $params Request parameters; appended to the query
     *                             string for GET, sent as the body for POST.
     * @param string       $method 'GET' or 'POST' (case-insensitive).
     * @param array        $header Extra HTTP header lines.
     * @param bool         $multi  When true, $params is handed to cURL as-is
     *                             (needed for file uploads) instead of being
     *                             URL-encoded.
     *
     * @return string Response body.
     *
     * @throws Exception On an unsupported method or a cURL error.
     */
    function http($url, $params = array(), $method = 'POST', $header = array(), $multi = false){
        $opts = array(
            CURLOPT_TIMEOUT => 60,
            CURLOPT_RETURNTRANSFER => 1,
            // NOTE(review): certificate verification is switched off here;
            // enable it if this helper is ever used for HTTPS endpoints that matter.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_HTTPHEADER => $header
        );
        /* Method-specific options */
        switch(strtoupper($method)){
            case 'GET':
                $query = empty($params) ? '' : http_build_query($params);
                if ($query !== '') {
                    // Join with '&' when the URL already has a query string.
                    $url .= (strpos($url, '?') === false ? '?' : '&') . $query;
                }
                $opts[CURLOPT_URL] = $url;
                break;
            case 'POST':
                // Multipart (file) uploads need the raw array.
                $params = $multi ? $params : http_build_query($params);
                $opts[CURLOPT_URL] = $url;
                $opts[CURLOPT_POST] = 1;
                $opts[CURLOPT_POSTFIELDS] = $params;
                break;
            default:
                throw new Exception('不支持的请求方式!');
        }
        /* Initialise and run the cURL request */
        $ch = curl_init();
        curl_setopt_array($ch, $opts);
        $data = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);
        if($error) throw new Exception('请求发生错误:' . $error);
        return $data;
    }

    /**
     * Convert GBK/GB2312 text to UTF-8.
     *
     * Arrays are walked recursively and keep their keys. A string is converted
     * only when it is NOT already well-formed UTF-8 (ASCII included) and is
     * well-formed GBK; every other value is returned unchanged.
     *
     * @param mixed $mixed String or (nested) array of strings.
     *
     * @return mixed Same structure with GBK strings re-encoded as UTF-8.
     */
    function charsetToUTF8($mixed)
    {
        if (is_array($mixed)) {
            foreach ($mixed as $k => $v) {
                $mixed[$k] = $this->charsetToUTF8($v);
            }
            return $mixed;
        }
        if (!is_string($mixed) || mb_check_encoding($mixed, 'UTF-8')) {
            return $mixed;
        }
        // GBK is a superset of GB2312, so one check covers both.
        if (!mb_check_encoding($mixed, 'GBK')) {
            return $mixed;
        }
        $converted = iconv('GBK', 'UTF-8', $mixed);
        // Keep the original bytes rather than storing boolean false.
        return $converted === false ? $mixed : $converted;
    }
}
5.乱码问题
采集到的数据,插入数据库的时候一直显示乱码,无论直接用iconv, mb_detect_encoding都不行。
后来在网上找到了解决方案。
/**
 * Convert UTF-8 text to GBK.
 *
 * Arrays are walked recursively and keep their keys. A string is converted
 * only when it is well-formed UTF-8; non-strings and byte strings that are
 * not valid UTF-8 are returned unchanged. If iconv cannot convert a string,
 * the original value is kept instead of being replaced by false.
 *
 * @param mixed $mixed Source data: a string or a (nested) array of strings.
 *
 * @return mixed Same structure with UTF-8 strings re-encoded as GBK.
 */
function charsetToGBK($mixed)
{
    if (is_array($mixed)) {
        foreach ($mixed as $k => $v) {
            $mixed[$k] = charsetToGBK($v);
        }
        return $mixed;
    }
    // Pure ASCII is valid UTF-8 too; converting it is a no-op.
    if (!is_string($mixed) || !mb_check_encoding($mixed, 'UTF-8')) {
        return $mixed;
    }
    $converted = iconv('UTF-8', 'GBK', $mixed);
    return $converted === false ? $mixed : $converted;
}
/**
 * Convert GBK/GB2312 text to UTF-8.
 *
 * Arrays are walked recursively and keep their keys. A string is converted
 * only when it is NOT already well-formed UTF-8 (ASCII included) and is
 * well-formed GBK; every other value is returned unchanged. If iconv cannot
 * convert a string, the original value is kept instead of being replaced by
 * false.
 *
 * @param mixed $mixed Source data: a string or a (nested) array of strings.
 *
 * @return mixed Same structure with GBK strings re-encoded as UTF-8.
 */
function charsetToUTF8($mixed)
{
    if (is_array($mixed)) {
        foreach ($mixed as $k => $v) {
            $mixed[$k] = charsetToUTF8($v);
        }
        return $mixed;
    }
    if (!is_string($mixed) || mb_check_encoding($mixed, 'UTF-8')) {
        return $mixed;
    }
    // GBK is a superset of GB2312, so one check covers both; this also
    // catches GBK-only characters that are not valid GB2312.
    if (!mb_check_encoding($mixed, 'GBK')) {
        return $mixed;
    }
    $converted = iconv('GBK', 'UTF-8', $mixed);
    return $converted === false ? $mixed : $converted;
}
