limit = 500;
        $this->phonecache = [];

        // Site URL path segment for each deal type / category pair (consumed by create_url()).
        // Keys are quoted: bare words here are undefined constants and throw on PHP 8.
        $this->deals = [
            "prodam" => [
                "kvartiry" => "apartments-sale",
                "komnaty" => "rooms-sale",
                "doma_dachi_kottedzhi" => "out-of-town",
                "zemelnye_uchastki" => "out-of-town/lands",
                "kommercheskaya_nedvizhimost" => "commercial-sale",
            ],
            "sdam" => [
                "kvartiry" => "rent",
                "komnaty" => "rooms-rent",
                "doma_dachi_kottedzhi" => "out-of-town-rent",
                "kommercheskaya_nedvizhimost" => "commercial",
            ],
        ];

        // Numeric id for each category code.
        $this->categories = [
            "kvartiry" => 1, # Apartments
            "komnaty" => 2, # Rooms
            "doma_dachi_kottedzhi" => 3, # Houses, dachas, cottages
            "zemelnye_uchastki" => 4, # Land plots
            "kommercheskaya_nedvizhimost" => 5, # Commercial real estate
            "garazhi_i_mashinomesta" => 6, # Garages and parking spaces
            "nedvizhimost_za_rubezhom" => 7, # Real estate abroad
        ];

        // Maps source-site section names / attribute values to internal object-type codes.
        $this->objects = [
            "secondary" => "vtorichka",
            "new" => "novostroyka",
            "rooms-sale" => "komnaty",
            "rooms-rent" => "komnaty",
            # building type (!!!)
            // NOTE(review): "houses" is defined again further down as "drugoe"; in a PHP
            // array literal the later entry wins, so this "dom" value is never effective.
            "houses" => "dom",
            "cottage" => "kottedzh",
            "out-of-town-rent" => "dom",
            "дом" => "dom",
            "дача" => "dacha",
            "особняк" => "kottedzh",
            "таун-хаус" => "taunhaus",
            "коттедж" => "kottedzh",
            # permitted land use (!!!)
            "lands" => "promnaznacheniya",
            "сельскохозяйственное производство" => "selhoznaznacheniya",
            "личное подсобное хозяйство (полевой или приусадебный участок)" => "selhoznaznacheniya",
            "садоводство или огородничество" => "selhoznaznacheniya",
            "дачное строительство" => "dacha",
            "крестьянско-фермерское хозяйство" => "selhoznaznacheniya",
            "индивидуальное жилищное строительство" => "izhs",
            # **********
            "offices" => "ofis",
            "misc" => "drugoe",
            "retail" => "magazin",
            "production-warehouses" => "proizvodstvo",
            "houses" => "drugoe",
            "eating" => "magazin",
            # premises purpose (!!!)
            "торговое" => "magazin",
            "банковское" => "drugoe",
            "медицинское" => "drugoe",
            "бытовое" => "drugoe",
            "спортивное" => "drugoe",
            "автомобильное" => "drugoe",
            "склад" => "sklad",
            "производство" => "proizvodstvo",
            "производство и склад" => "proizvodstvo",
        ];

        // Common search-filter URL suffix shared by every section below.
        $etc = "search/date_create=three_days/list=list/tab=users/sort/date_sort:desc/";

        // URL suffix (or list of suffixes) to crawl for each deal type / category pair.
        $this->sections = [
            "prodam" => [
                "kvartiry" => $etc,
                "komnaty" => $etc,
                "doma_dachi_kottedzhi" => [
                    "houses/{$etc}",
                    "cottage/{$etc}",
                ],
                "zemelnye_uchastki" => $etc,
                "kommercheskaya_nedvizhimost" => [
                    "offices/{$etc}",
                    "retail/{$etc}",
                    "production-warehouses/{$etc}",
                    "houses/{$etc}",
                    "eating/{$etc}",
                    "misc/{$etc}",
                ],
            ],
            "sdam" => [
                "kvartiry" => "{$etc}rent_period=3674653711/",
                "komnaty" => "{$etc}rent_period=3674653711/",
                "doma_dachi_kottedzhi" => "{$etc}rent_period=3674653711/",
                "kommercheskaya_nedvizhimost" => [
                    "offices/{$etc}rent_period=3674653711/",
                    "retail/{$etc}rent_period=3674653711/",
                    "production-warehouses/{$etc}rent_period=3674653711/",
                    "houses/{$etc}rent_period=3674653711/",
                    "eating/{$etc}rent_period=3674653711/",
                    "misc/{$etc}rent_period=3674653711/",
                ],
            ],
        ];
    }

    /**
     * Downloads a listing page, keeps it in $this->data, and returns the largest
     * page number found in its pagination block.
     *
     * @param string $url listing URL
     * @param int    $n   retry counter; a non-zero value reports the failure via
     *                    update_proxy() and, above 3, gives up
     * @return mixed highest page number, 1 when no pagination items are found, or
     *               the result of __log() after too many retries
     */
    public function _getPages($url, $n = 0){
        if($n){
            $err = "getPages({$url}) - error receiving data";
            $this->update_proxy(1, -1, $err);
            if($n > 3) return $this->__log($err);
        }

        $this->data = $this->get_url($url, false);
        /*$file = $_ENV["ROOT"] . "/" . $this->site->code . "-" . $this->deal_type . "-" . $this->category . ".html";
        if(!is_file($file)){
            $this->data = $this->get_url($url, false);
            file_put_contents($file, $this->data);
        }
        else{
            $this->data = file_get_contents($file);
        }*/

        // NOTE(review): the three patterns below begin with "]*", which looks like a
        // truncated tag opener (e.g. "<tag\b[^>]*") lost from this copy of the file.
        // They are kept verbatim; restore them from the upstream source if available.
        $r = preg_match('/]*\bclass="[^"]*\bjs-filtersBlock\b[^"]*"[^>]*>/', $this->data);
        // The filters block is used as the marker of a successfully loaded page.
        if(!$r) return $this->_getPages($url,$n + 1);

        // No pagination block at all means a single page.
        if(!preg_match('/]*\bclass="[^"]*\bjs-paginationBlockPages\b[^"]*"[^>]*>(.*)<\/ul>/ismU', $this->data, $tmp)) return 1;

        $r = preg_match_all('/]*>(.*)<\/li>/ismU', $tmp[1], $tmp);
        if(!$r) return 1;

        $pages = array_map(function($li){
            return (int) trim(strip_tags($li));
        }, $tmp[1]);

        return max($pages);
    }

    /**
     * Wrapper around _getPages() that also prints the result.
     *
     * @param string $url listing URL
     * @param int    $n   retry counter passed through to _getPages()
     * @return mixed whatever _getPages() returned
     */
    public function getPages($url, $n = 0){
        $p = $this->_getPages($url, $n);
        echo "getPages: ({$p})" . PHP_EOL;
        return $p;
    }

    /**
     * Extracts the advert summaries from a listing page.
     *
     * Uses the page already stored in $this->data (and clears it) when present,
     * otherwise downloads $url.
     *
     * @param string $url listing URL
     * @param int    $n   retry counter; above 3 the error is logged and [] is returned
     * @return array stdClass items (id, url, title, price, date) keyed by advert id
     */
    public function getItems($url, $n = 0){
        if($n){
            $err = "getItems({$url}) - error receiving data";
            $this->update_proxy(1, -1, $err);
            if($n > 3){
                $this->__log($err);
                return [];
            }
        }

        $data = $this->data ? $this->data : $this->get_url($url, false);
        $this->data = null;

        // NOTE(review): pattern starts with "]*" - apparently a truncated tag opener; kept verbatim.
        $r = preg_match('/]*\bclass="[^"]*\bjs-filtersBlock\b[^"]*"[^>]*>/', $data);
        if(!$r) return $this->getItems($url, $n + 1);

        // Keep only the part of the page before the "similar listings" title.
        // $data = preg_split('/>\s*Предложения из ближайших регионов\s*<\//ismuU',$data)[0];
        $data = preg_split('/\bclass="similar__listingTitle"[^>]*>.*<\//ismuU',$data)[0];

        // NOTE(review): this pattern appears damaged in this copy: tag openers are
        // missing and three groups read "(?" with no name, while the loop below reads
        // the groups id, url, title, price and date. It is reproduced verbatim and is
        // unlikely to compile; restore it from the upstream source.
        $r = preg_match_all('/ ]+\bclass="[^"]*\bproductBlock\b[^"]*") (?=[^>]*\bdata-item-id="(?\d+)") [^>]*> (?=.*]+\bhref="(?.*)")(?=[^>]+\bclass="[^"]*\blisting__itemTitle\b[^"]*")[^>]*>(?.*)<\/a>) (?=.*<div\b[^>]+\bclass="[^"]*\blisting__itemPrice\b[^"]*"[^>]*>(?<price>.*)<\/div>) (?=.*<span[^>]+\bclass="[^"]*\blisting__itemDate\b[^"]*"[^>]*>(?<date>.*)<\/span>) .* <\/div> /ismxU', $data, $data);
        if(!$r) return [];

        $items = [];
        for($i = 0; $i < $r; $i++){
            $item = new stdClass();
            $item->id = $data["id"][$i];
            $item->url = $data["url"][$i];
            $item->title = trim(html_entity_decode(strip_tags($data["title"][$i])));

            // Price: digits only, with a currency suffix, or a placeholder when absent.
            $item->price = preg_replace('/\D+/', '', strip_tags($data["price"][$i]));
            $item->price = $item->price ? $item->price . " руб." : "Не указана";

            // Date text is split on the first comma-separated parts and handed to get_date().
            $item->date = trim(html_entity_decode(strip_tags($data["date"][$i])));
            $parts = preg_split('/\s*,\s*/', $item->date);
            $item->date = $this->get_date($parts[0], isset($parts[1]) ? $parts[1] : null);

            $items[$item->id] = $item;
        }

        return $items;
    }

    /**
     * Downloads an advert page, fills in the details on $item and saves it.
     *
     * Populates title, seller, display, address, images, description, params,
     * level/levels, material, land, s (area), rooms, raion and phone, then calls
     * saveItem().
     *
     * @param object $item item produced by getItems(); id and url are read
     * @param int    $n    retry counter; above 3 the error is logged and returned
     * @return mixed the populated $item, or the result of __log() on failure
     */
    public function getItem($item, $n = 0){
        if($n){
            $err = "getItem({$item->id}) - error receiving data";
            $this->update_proxy(1,-1,$err);
            if($n > 3) return $this->__log($err);
        }

        $data = $this->get_url($item->url, false);
        /*$file = $_ENV["ROOT"] . "/" . $this->site->code . "-" . $item->id . ".html";
        if(!is_file($file)){
            $data = $this->get_url($item->url, false);
            file_put_contents($file, $data);
        }
        else{
            $data = file_get_contents($file);
        }*/

        // The hidden advert-id input must carry this item's id, otherwise the page is retried.
        // The id is escaped so it can never alter the pattern.
        $r = preg_match('/<input\b(?=[^>]*\btype="hidden")(?=[^>]*\bclass="js-advertId")(?=[^>]*\bvalue="' . preg_quote($item->id, '/') . '")[^>]*>/ismU', $data, $tmp);
        if(!$r) return $this->getItem($item,$n + 1);

        # <get_title> (!)
        $r = preg_match("/<meta\b(?=[^>]*\bproperty=\"og[:]title\")(?=[^>]*\bcontent=\"(.*)\")[^>]*>/ismU",$data,$tmp);
        if($r) $item->title = $tmp[1];
        # </get_title>

        # <get_address+seller> (!)
        preg_match_all('/<div\b[^>]+\bclass="productPage__infoItem"[^>]*>(.*)<\/div>/isU', $data, $tmp);
        foreach($tmp[1] as $block){
            // $m[1] is the icon name suffix, $m[2] the remainder of the info block.
            if(!preg_match('/<i class="[^"]*\bicon_(.*)\b[^"]*">(.*)$/isU', $block, $m)) continue;

            if($m[1] == "house" || $m[1] == "head"){
                $item->seller = trim(html_entity_decode(strip_tags($m[2])));
                $item->display = $m[1] == "head";
            }
            elseif($m[1] == "spot" && empty($item->address)){
                // Only the first "spot" block is used; a leading city name is stripped.
                $item->address = trim(html_entity_decode(strip_tags($m[2])));
                $item->address = preg_replace('/^' . preg_quote(conf::$city->name, '/') . '\b\s*,\s*/isu', '', $item->address);
            }
        }
        # </get_address+seller>

        # <get_images> (???)
        $item->images = [];
        if(preg_match('/<div\b[^>]+\bclass="[^"]*\bproductPage__gallery\b[^"]*"[^>]*>(.*)<\/div>/ismU', $data, $tmp)){
            preg_match_all('/<a\b[^>]+href="(.*)"[^>]*>\s*<img\b[^>]+>\s*<\/a>/ismU', $tmp[1], $tmp);
            $item->images = $tmp[1];
        }
        elseif(preg_match('/<div\b[^>]+\bclass="[^"]*\blineGallery\b[^"]*"[^>]*>(.*)<\/div>/ismU', $data, $tmp)){
            preg_match_all('/<img\b[^>]*\bdata-src="(.*)"[^>]*>/ismU', $tmp[1], $tmp);
            $item->images = $tmp[1];
        }
        $item->images = implode(PHP_EOL, $item->images);
        # </get_images>

        # <get_description> (!)
        $r = preg_match("/<meta\b(?=[^>]*\bproperty=\"og[:]description\")(?=[^>]*\bcontent=\"(.*)\")[^>]*>/ismU",$data,$tmp);
        if($r) $item->description = $tmp[1];
        # </get_description>

        # <get_params> (!)
        // $params: lower-cased label => value; $item->params: the same data as "label: value; " text.
        $params = [];
        $item->params = "";
        /*
        <div class="productPage__characteristicsItem">
            <span class="productPage__characteristicsItemValue">2</span>
            <span class="productPage__characteristicsItemTitle"> комнаты</span>
        </div>
        */
        $r = preg_match_all('/<div\b[^>]+\bclass="productPage__characteristicsItem"[^>]*>\s*<span\b[^>]+\bclass="productPage__characteristicsItemValue"[^>]*>(.*)<\/span>\s*<span\b[^>]+\bclass="productPage__characteristicsItemTitle"[^>]*>(.*)<\/span>\s*<\/div>/ismU', $data, $tmp);
        for($i = 0; $i < $r; $i++){
            $value = preg_replace('/^\s+|\s+$/u','',html_entity_decode(strip_tags($tmp[1][$i])));
            $label = preg_replace('/^\s+|\s+$/u','',html_entity_decode(strip_tags($tmp[2][$i])));
            $params[mb_strtolower($label, "utf-8")] = $value;
            $item->params .= $label . ": " . $value . "; ";
        }

        // <li class="productPage__infoColumnBlockText">\s*(KEY:\s+)?VALUE\s*</li>
        $r = preg_match_all('/<li\b[^>]+\bclass="productPage__infoColumnBlockText"[^>]*>(.*)<\/li>/ismU', $data, $tmp);
        for($i = 0; $i < $r; $i++){
            $text = trim(html_entity_decode(strip_tags($tmp[1][$i])));
            if(preg_match('/^(.+):\s*(.+)$/isU', $text, $kv)){
                $text = $params[mb_strtolower($kv[1],"utf-8")] = $kv[2];
                $item->params .= $kv[1] . ": ";
            }
            $item->params .= $text . "; ";
        }
        $item->params = trim($item->params);

        /* (??) */
        if(isset($params["этаж"])){
            $item->level = intval($params["этаж"]);
            // A value of the form "<digits><non-digits><digits>" also yields the total number of floors.
            if(preg_match('/^(\d+)\D+(\d+)$/is', $params["этаж"], $tmp)) $item->levels = $tmp[2];
        }

        // Optional parameters: fall back to null instead of reading a missing key.
        $item->material = isset($params["материал стен"]) ? $params["материал стен"] : null;
        $item->land = __floatval(isset($params["площадь участка"]) ? $params["площадь участка"] : null);

        foreach(["этажей в здании","количество этажей"] as $key){
            if(isset($params[$key])){
                $item->levels = intval($params[$key]);
                if($item->levels) break;
            }
        }

        // First non-zero area among the known labels wins.
        foreach(["площадь арендуемой комнаты","площадь продажи","общая площадь квартиры","общая площадь","жилая площадь","площадь строения"] as $key){
            if(isset($params[$key])){
                $item->s = __floatval($params[$key]);
                if($item->s) break;
            }
        }

        // Free text used as a fallback source for area, rooms and land size.
        $text = $item->title . PHP_EOL . (isset($item->description) ? $item->description : "");

        if(empty($item->s)){
            $item->s = self::parse_square($text);
        }

        foreach(["комнаты","количество комнат на продажу","комнат в квартире/общежитии","количество комнат","комнат в квартире"] as $key){
            if(isset($params[$key])){
                $item->rooms = intval($params[$key]);
                if($item->rooms) break;
            }
        }

        if(empty($item->rooms) && ($this->category == "kvartiry" || $this->category == "komnaty")) {
            $item->rooms = self::parse_rooms($text, $this->category);
        }

        if(!$item->land && ($this->category == "zemelnye_uchastki" || $this->category == "doma_dachi_kottedzhi")){
            $item->land = self::parse_land($text);
        }
        # </get_params>

        # <get_raion>
        $item->raion = isset($params["район города"]) ? $params["район города"] : null;
        # </get_raion>

        # <get_phones>
        $L = $this->cities[$this->city]["phonelength"];
        $item->phone = [];
        if(preg_match('/<div\b(?=[^>]+\bclass="[^"]*\bproductPage__phoneText\b[^"]*")(?=[^>]+\bdata-phone="(.*)")[^>]*>/ismU', $data, $tmp)){
            // data-phone is base64-encoded; split the decoded text into candidate numbers.
            $phones = preg_split('/[^\d\(\)\s-]+/is', base64_decode($tmp[1]));
            foreach($phones as $phone){
                $phone = trim($phone);
                if(mb_strlen($phone, "utf-8") >= $L) $item->phone[] = $phone;
            }
        }
        elseif(preg_match('/\bт(?:ел(?:ефон))?\b(?:\p{P}\s*|\s+)([+-\d\(\)\s]{' . $L . ',})/ismu', isset($item->description) ? $item->description : "", $tmp)){
            // Fallback: a phone number mentioned in the description text.
            $item->phone[] = $tmp[1];
        }

        $code = substr($this->cities[$this->city]["code"], 1);
        $re = "[78](9\d{9,}|{$code}\d{{$L},})";
        $n = strlen($this->cities[$this->city]["code"]) + $L;
        foreach($item->phone as &$_){
            $_ = preg_replace("/^\(/", "8", $_);
            $_ = preg_replace("/\D/", "", $_);
            if(preg_match("/^789\d{9,}/", $_)) $_ = preg_replace("/^7/", "", $_);
            if(preg_match("/^9\d{9,}/", $_)) $_ = "8" . $_;
            // Numbers without a recognised prefix get the city code prepended.
            if(!preg_match("/^{$re}/", $_)) $_ = $this->cities[$this->city]["code"] . $_;
            // Truncate to the expected length: city-code numbers to $n digits, others to 11.
            $N = preg_match("/^[78]{$code}/", $_) ? $n : 11;
            $_ = mb_substr($_, 0, $N, "utf-8");
        }
        unset($_);

        $item->display = !empty($item->display) && $this->checkPhone($item->phone);
        $item->phone = implode(PHP_EOL, $item->phone);
        # </get_phones>

        $this->saveItem($item);
        return $item;
    }

    /**
     * Builds the listing URL for the current city, deal type and category.
     *
     * @param string $urlsfx section suffix (an entry of $this->sections)
     * @param int    $p      page number; "page{$p}/" is appended only when above 1
     * @return string
     */
    public function create_url($urlsfx, $p = 1){
        $url = "http://{$this->city}.{$this->site->host}/real-estate/{$this->deals[$this->deal_type][$this->category]}/{$urlsfx}";
        if($p > 1) $url .= "page{$p}/";
        return $url;
    }

    /**
     * Fetches a URL with cURL, directly or through a proxy, retrying on failure.
     *
     * NOTE(review): the retry is recursive and has no upper bound visible here; it
     * only alternates between proxy and direct mode once $n exceeds $this->fails / 3.
     *
     * @param string $url   absolute http(s) URL
     * @param mixed  $proxy true to let get_proxy() pick one, a specific value to pass
     *                      to get_proxy(), or false for a direct request
     * @param mixed  $opts  extra HTTP header line(s): an array or a single value
     * @param int    $n     attempt counter
     * @return mixed response body, false on a direct 404, or the result of error()
     *               for an invalid URL
     */
    public function get_url($url, $proxy = true, $opts = [], $n = 1) {
        if(!$url || !preg_match("/^https?[:]\/\//",$url)) return $this->error("get_url: incorrect url = {$url}");

        // After too many attempts switch mode: proxy -> direct, or direct -> proxy with a reset counter.
        if($n > $this->fails / 3) {
            if($proxy) $proxy = false;
            else{
                $proxy = true;
                $n = 1;
            }
        }

        $log = "get_url: url = {$url}";
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        // NOTE(review): only one of COOKIEFILE / COOKIEJAR is set, so once the cookie
        // file exists it is read but never rewritten - confirm this is intended.
        curl_setopt($ch, file_exists($this->cookies) ? CURLOPT_COOKIEFILE : CURLOPT_COOKIEJAR, $this->cookies);

        $headers = is_array($opts) ? $opts : [$opts];
        $headers[] = "Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3";
        $headers[] = "Cache-Control: max-age=0";
        $headers[] = "Connection: keep-alive";
        $headers[] = "User-Agent: Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0";

        if($proxy){
            curl_setopt($ch, CURLOPT_MAXREDIRS, 2);
            $this->get_proxy($proxy === true ? "" : $proxy);
            curl_setopt($ch, CURLOPT_PROXY, $this->proxy);
            $log .= ", proxy = {$this->proxy}";
            $headers[] = "X-Forwarded-For: {$this->__proxy->ip}";
        }
        else{
            $this->__proxy = null;
            $this->proxy = null;
            // Direct requests are throttled.
            sleep(3);
        }

        $this->__log($log);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        $ss = curl_exec($ch);
        $er = curl_error($ch);
        $ci = curl_getinfo($ch);
        curl_close($ch);

        // Anything other than a clean HTTP 200 counts as a failure.
        $f = $er || $ci["http_code"] != 200;
        if($proxy) $this->update_proxy($f,$ci["http_code"],$er);
        else if($ci["http_code"] == 404) return false;

        if($f){
            $this->__log("{$log} fails (err = {$er}, http_code = {$ci["http_code"]}), retry...");
            return $this->get_url($url,$proxy,$opts,$n + 1);
        }

        return $ss;
    }

    /**
     * Sets the cookie-file path for this site and removes any existing cookie file.
     */
    public function session_start(){
        // The key must be quoted: inside {} an unquoted ROOT is looked up as a constant.
        $this->cookies = "{$_ENV['ROOT']}/../logs/{$this->site->code}.cookie";
        @unlink($this->cookies);
    }

    /**
     * Intentionally empty; the actual cleanup lives in _session_end().
     */
    public function session_end() {
    }

    /**
     * Removes this site's pid file and cookie file.
     */
    public function _session_end(){
        @unlink("{$_ENV['ROOT']}/../pids/{$this->site->code}.pid");
        @unlink($this->cookies);
    }

    /*public function checkItem($item){
        $rt = mysql_fetch_assoc(mysql_query("SELECT * FROM objects WHERE site_id = '{$this->site->id}' AND id = '{$item->id}'"));
        return $rt ? 0 : 1;
    }*/

    /**
     * Runs the parent grab() for every city / deal type / category combination,
     * then removes the pid and cookie files.
     */
    public function grab(){
        foreach(array_keys($this->cities) as $city){
            foreach($this->sections as $deal_type => $categories){
                foreach(array_keys($categories) as $category){
                    $this->__log("\n\n=========\n grab($city, $deal_type, $category);\n=========\n");
                    parent::grab($city, $deal_type, $category);
                }
            }
        }
        $this->_session_end();
    }

    /**
     * Kills the process recorded in the pid file (if any), starts grabber.php for
     * this site in the background and records the new pid.
     */
    public function cron(){
        $logfile = "{$_ENV['ROOT']}/../logs/{$this->site->code}.log";
        $pidfile = "{$_ENV['ROOT']}/../pids/{$this->site->code}.pid";

        if(is_file($pidfile)){
            // Only a positive integer is ever passed to the shell.
            $pid = (int) trim(file_get_contents($pidfile));
            if($pid > 0) shell_exec("kill -9 {$pid} > /dev/null");
        }

        $cmd = "php {$_ENV['ROOT']}/grabber.php {$this->site->code} > {$logfile} & echo $!";
        echo "[",date("Y-M-d H:i:s"),"] ", $cmd ,"\n";
        $pid = shell_exec($cmd);
        file_put_contents($pidfile,$pid);
    }
}