timeout = 30; $this->check_olditems = 1; $this->limit = 21; // Формирование ссылки $this->deals = [ prodam => 1, # Продам sdam => 2, # Сдам ]; $this->categories = [ kvartiry => 1, # Квартиры komnaty => 2, # Комнаты doma => 3, # Дома, коттеджи zemlya => 4, # Земельные участки commercial => 5, # Коммерческая недвижимость garazhi => 6, # Гаражи dachi => 7, # Дачи ]; } public function getPages($url, $n = 0) { if($n){ $err = "getPages({$url}) - error receiving data"; $this->update_proxy(1, -1, $err); if($n > $this->fails){ $this->__log($err); return []; } } $data = $this->data ? $this->data : $this->get_url($url, true); /* вычисляем количество страницы */ $r = preg_match('/page-pagination-list[^<]*(.*)/i', $data, $tmp); if (!$r) return 1; $r = preg_match_all('/href=\"[^>]+>(\d+)<\/a[^<]+<\/li>/i', $tmp[1], $tmp); if (!$r) return 1; $p = max($tmp[1]); return $p; } public function getItems($url, $n = 0){ if($n){ $err = "getItems({$url}) - error receiving data"; $this->update_proxy(1, -1, $err); if($n > $this->fails){ $this->__log($err); return []; } } $data = $this->data ? $this->data : $this->get_url($url, true); /* Здесь обрабатываем ошибку загрузки страницы $r = preg_match("/Подождите, идет загрузка/", $this->data); if($r) return $this->getItems($url, $n + 1); $this->data = null; */ $r = preg_match('/
getItems($url,$n+1); $items = [] ; // $r = preg_match_all('/]*>([^<]*)([^<]*)[^>]*>([^<]*)[^\"]*[^\"]*[^>]*>([^<]*)[^\"]*[^>]*>([^<]*)[^\"]*[^>]*>([^<]*)[^f]*floor\">([^<]*)[^m]*material\">([^<]*)[^f]*full\">([^<]*)[^#]*[^m]*more[^\/]*\/span>[^>]*([^\"]*)[^y]*[^-]*-favid=\"(\d*)/ism', $data, $tmp); //$r=preg_match_all('/catalog-product-address\s*\">([^<]*)[^>]*>([^<]*)[^\"]*\s*\"catalog-product-square-2\"\s*>([^<]*)<[^\"]*\"catalog-product-region\">([^<]*)(<)[^f]+floor\">([^<]*)[^=]*[^>]*>([^<]*)[^f]+full\"\s*>([^<]*)[^#]+[^\"]*\"\s*catalog-product-body[^=]*=\s*\"\s*more\s*"\s*>[^>]*>([^\"]*)[^y]+[^-]+-favid=\"(\d*)/ism', $data, $tmp); $r = preg_match_all('/]+\bhref="(.*)")(?=[^>]+\bclass="[^"]*\bcatalog-product\b[^"]*")[^>]*>(.*)<\/a>/isU', $data, $data); /* foreach($tmp as $_){ var_dump($_); } */ $k = count($data[1]); for($i = 0; $i < $k; $i++) { if (preg_match('//')) $item->url = 'https://' . $this->site->host . $data[1][$i]; $item->id = preg_replace('/^.*\/(\d+)\/?$/ism', '$1', $data[1][$i]); preg_match('/catalog\s*-\s*product\s*-\s*floor\s*\"\s*>([^<]*)/ism', $data[2][$i] , $tmp[$i]); $y = trim($tmp[$i][1]); $z = explode("/", $y); $item->level = $z[0]; // $levels = explode(" ", $z[1]); $item->levels = $levels[0]; preg_match('/(\d*)/', $z[1], $_levels); $item->levels = $_levels[1]; preg_match('/catalog\s*-\s*product\s*-\s*material\s*\"\s*>([^<]*)/ism', $data[2][$i] , $tmp[$i]); $item->material = $tmp[$i][1]; preg_match('/price\s*-\s*full\s*\"\s*>([^<]*)/ism', $data[2][$i] , $tmp[$i]); $item->price = str_replace(' ', '', $tmp[$i][1]); preg_match('/catalog\s*-\s*product\s*-\s*square\s*\"\s*>([^<]*)/ism', $data[2][$i] , $tmp[$i]); $item->s = explode(" ", ltrim($tmp[$i][1]))[0]; preg_match('/catalog\s*-\s*product\s*-\s*street\s*\"\s*>([^<]*)/ism', $data[2][$i] , $tmp[$i]); $item->address = trim($tmp[$i][1]); $item->house_number = (explode(", ", trim($tmp[$i][1]))[1]) ? 
explode(", ", trim($tmp[$i][1]))[1] : ""; preg_match('/catalog\s*-\s*product\s*-\s*address\s*\"\s*>([^<]*)/ism', $data[2][$i] , $tmp[$i]); $item->rooms = $this->getRooms(trim(trim($tmp[$i][1]), ",")); preg_match('/catalog\s*-\s*product\s*-\s*region\s*\"\s*>([^<]*)/ism', $data[2][$i] , $tmp[$i]); $item->raion = explode(", " , $tmp[$i][1])[1]; $item->params = "Количество комнат: ". $item->rooms.", Этаж/Этажность: ".$item->level."/".$item->levels.", Материал: ".$item->material.", Площадь: ".$item->s." m2"; preg_match('/\s*alt\s*=\s*\"([^\"]*)/ism', $data[2][$i] , $tmp[$i]); $item->title = str_replace('&sup', '', $tmp[$i][1]); // $ro = $this->getRooms(trim(trim($tmp[1][$i]), ",")); // var_dump($ro); /* $items[$tmp[10][$i]] = (object)[ id => $tmp[10][$i], url => "https://" . $this->site->host . "/stickers/view/" . $tmp[10][$i], date => 0, price => str_replace(' ', '', $tmp[8][$i]), s => explode(" ", ltrim($tmp[3][$i]))[0], level =>explode("/", ltrim($tmp[6][$i]))[0], levels => explode(" ", explode("/", ltrim($tmp[6][$i]))[1])[0], address => trim($tmp[2][$i]), title => $this->getRooms(trim(trim($tmp[1][$i]), ","))."-комн. квартира, ". trim($tmp[4][$i]).", ".trim($tmp[2][$i]), material => $tmp[7][$i], raion => explode(", " , $tmp[4][$i])[1], house_number => (explode(", ", trim($tmp[2][$i]))[1]) ? explode(", ", trim($tmp[2][$i]))[1] : "", rooms => $this->getRooms(trim(trim($tmp[1][$i]), ",")), params => "Количество комнат: ".$this->getRooms(trim(trim($tmp[1][$i]), ",")).", Этаж/Этажность: ".explode("/", ltrim($tmp[6][$i]))[0]."/". explode(" ", explode("/", ltrim($tmp[6][$i]))[1])[0].", Материал: ".$tmp[7][$i].", Площадь: ".explode(" ", ltrim($tmp[3][$i]))[0], ]; */ $d = preg_match_all('/
([^<]*)/ism', $data[2][$i], $tmp[$i]); if($d){ $tmpDateArray = explode(' ', $tmp[$i][1]); if(stripos($tmpDateArray[1], "минут") === 0) { $item->date = date("Y-m-d H:i:s", time() - $tmpDateArray[0]*60); $this->__log("Minut back {$tmpDateArray[0] }"); } elseif (stripos($tmpDateArray[1], "час") === 0) { $item->date = date("Y-m-d H:i:s", time() - $tmpDateArray[0]*60*60); $this->__log("Chasov back {$tmpDateArray[0] }"); } elseif (stripos($tmpDateArray[0], "вчер") === 0) { $item->date = date("Y-m-d H:i:s", time() - 60*60*24); $this->__log("Chasov back {$tmpDateArray[0] }"); } } else {$item->date = 0; $this->__log("На странице нету даты"); } $n = $i +1; // foreach($items as $item){ $this->__log("Counts {$n} Item id {$item->id} AND url {$item->url} AND {$item->date} AND SQUARE {$item->s} AND PRICE {$item->price} AND ADDREDD {$item->address} AND ROOMS {$item->rooms} AND RAION {$item->raion} AND MATERIAL {$item->material} AND HOUSE_NUM {$item->house_number} AND levl {$item->level} AND LEVELS {$item->levels} \n"); $items[$item->id] = (object)[ id => $item->id, url => $item->url, level => $item->level, levels => $item->levels, material => $item->material, price => $item->price, s => $item->s, address => $item->address, house_number => $item->house_number, rooms => $item->rooms, raion => $item->raion, params => $item->params, title => $item->title, ]; } return $items; } public function getItem($item, $n = 0) { if($n) { $err = "getItem({$item->id}) - error receiving data"; $this->update_proxy(1, -1, $err); if ($n > $this->fails) return $this->__log($err); } $this->__log("getItem: {$item->url}"); $data = $this->get_url($item->url, true); $r=preg_match('/
.*
getItem($item, $n + 1); } $item->display = 1; $item->is_agency = 0; $item->seller = ""; if(preg_match('/Страница не найдена/',$data)) { $this->__log('Sibdom: Такой страницы на нашем сайте нет'); return $this->getItem($item, $n + 1); } $r2 = preg_match("/добавлено([^<]*)|обновлено([^<]*)/ism", $tmp[0], $tmp2); var_dump($tmp2); // if ($item->date==""){ if ($tmp2[1]==0){ $item->date = date ("Y-m-d", strtotime($tmp2[2]))." "."00:00:00";} else $item->date = date ("Y-m-d", strtotime($tmp2[1]))." "."00:00:00"; print_r("DATAAAAAAAAAAAAAAAAAAAA ".$item->date); // } # $item->images = []; $r = preg_match_all("/Фото\s\d*.[^<]*images[] = $tmp2[1][$i]; } $item->images = implode("\n", $item->images); var_dump($item->images); # # $r = preg_match_all("/card-description[^<]*([^s]*)/ism", $tmp[0], $tmp2); $item->description = strip_tags($tmp2[1][0]); var_dump( $item->description); # //preg_match_all('/Площадь кухни[^\d]*([\S]*)/', $tmp[0], $tmp3); //TEST Item // $item = new stdClass(); // $item->id = $this->item->id # preg_match_all('/data-owner=\"([^\"]*)/ism', $tmp[0], $owner); preg_match_all('/data-key=\"([^\"]*)/ism', $tmp[0], $key); preg_match_all('/data-url_key=\"([^\"]*)/ism', $tmp[0], $url_key); /*$item->phone = $this->post_url( "https://www.sibdom.ru/api/get_phone", [ //'okey' => $item->SEC_CODE . ";", "owner" => $owner[1][0], "id" => $item->id, "key" => $key[1][0], "url_key" => $url_key[1][0], "mobile" => 0, ], TRUE, [ "Content-type: application/x-www-form-urlencoded; charset=UTF-8", "X-Requested-With: XMLHttpRequest", "Referer: {$url}", ] ); */ /* $send_data = array( "id" => $item->id, "key" => $key[1][0], "mobile" => 0, "owner" => $owner[1][0], "url_key" => $url_key[1][0], ); // var_dump($send_data); */ $item->phone = $this->getPhone($send_data); //var_dump("!!!!!!!!!!!!!! {$item->phone}"); # isset($this->oldItem[$item-id]) ? 
$this->updateItem($item) : $this->saveItem($item); return $item; }

    /**
     * Send a form-encoded POST request with cURL, optionally through a proxy.
     *
     * A failed attempt (cURL error or HTTP status other than 200) is retried
     * recursively, but never more than $this->fails attempts in total, so a
     * permanently failing endpoint can no longer recurse without bound.
     *
     * @param string       $url   Absolute http(s) URL.
     * @param array        $form  Form fields; encoded with http_build_query().
     * @param bool|string  $proxy Truthy: route through the proxy chosen by get_proxy(); falsy: direct request.
     * @param array|string $opts  Extra HTTP header line(s) to send.
     * @param int          $n     1-based attempt counter used by the retry recursion.
     *
     * @return string|false Response body on HTTP 200, false when the request is given up.
     */
    public function post_url($url, $form = [], $proxy = true, $opts = [], $n = 1)
    {
        if (!$url || !preg_match("/^https?[:]\/\//", $url)) {
            // error() is defined outside this file section; execution continues
            // afterwards exactly as in the original code if it returns.
            $this->error("post_url: incorrect url = {$url}");
        }

        // Total number of attempts allowed; at least one even if "fails" is unset.
        $maxAttempts = isset($this->fails) ? max(1, (int) $this->fails) : 1;

        $query = http_build_query($form);
        $log = "post_url: url = {$url}&{$query}";

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $query);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
        // Read cookies from the file when it already exists, otherwise ask cURL to create it.
        curl_setopt(
            $ch,
            file_exists($this->cookies) ? CURLOPT_COOKIEFILE : CURLOPT_COOKIEJAR,
            $this->cookies
        );

        $headers = is_array($opts) ? $opts : [$opts];
        $headers[] = "Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3";
        $headers[] = "Cache-Control: max-age=0";
        $headers[] = "Connection: keep-alive";
        $headers[] = "User-Agent: Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0";
        $headers[] = "Content-length: " . strlen($query);

        if ($proxy) {
            $this->get_proxy($proxy === true ? "" : $proxy);
            curl_setopt($ch, CURLOPT_PROXY, $this->proxy);
            if ($this->__proxy->pass) {
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $this->__proxy->login . ":" . $this->__proxy->pass);
            }
            $log .= ", proxy = {$this->proxy}";
            $headers[] = "X-Forwarded-For: {$this->__proxy->ip}";
        } else {
            $this->__proxy = null;
            $this->proxy = null;
            // Direct requests are throttled.
            sleep(2);
        }

        $this->__log($log);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

        $ss = curl_exec($ch);
        $er = curl_error($ch);
        $ci = curl_getinfo($ch);
        curl_close($ch);

        $httpCode = $ci["http_code"];
        $failed = $er || $httpCode != 200;

        if ($proxy) {
            $this->update_proxy($failed, $httpCode, $er);
        } elseif (preg_match("/^40\d$/", (string) $httpCode)) {
            // A 40x answer on a direct request is treated as final.
            $this->__log("{$log} fails (err = {$er}, http_code = {$httpCode})");
            return false;
        }

        if (!$failed) {
            return $ss;
        }

        if ($n >= $maxAttempts) {
            $this->__log("{$log} fails (err = {$er}, http_code = {$httpCode}), giving up after {$n} attempt(s)");
            return false;
        }

        $this->__log("{$log} fails (err = {$er}, http_code = {$httpCode}), retry...");
        return $this->post_url($url, $form, $proxy, $opts, $n + 1);
    }

    /**
     * Derive the number of rooms from a listing label.
     *
     * Digits found in the label win ("2-комн." gives 2). A label without a
     * non-zero number is mapped by name: "студия" and "гостинка" count as one
     * room, anything else (including "комната") yields 0.
     *
     * @param string $strRooms Label text taken from the listing.
     *
     * @return int
     */
    public function getRooms($strRooms)
    {
        // Explicit encoding: the comparison literals below are UTF-8 and must not
        // depend on the configured mbstring internal encoding.
        $label = trim(mb_strtolower((string) $strRooms, 'UTF-8'));

        $rooms = (int) filter_var($label, FILTER_SANITIZE_NUMBER_INT);
        if ($rooms !== 0) {
            return $rooms;
        }

        switch ($label) {
            case 'студия':
            case 'гостинка':
                return 1;
        }

        return 0;
    }

    /**
     * Request the seller phone for a listing from the site API.
     *
     * The response is fetched and dumped for debugging, but it is not parsed:
     * the method always returns the placeholder "Нет телефона", as the
     * original implementation did.
     *
     * NOTE(review): the payload is JSON-encoded while the Content-Type header
     * announces a form-urlencoded body — confirm which one the API expects.
     * NOTE(review): TLS peer verification is switched off below; confirm this
     * is intentional.
     *
     * @param mixed $send_data Payload passed to json_encode().
     *
     * @return string
     */
    public function getPhone($send_data)
    {
        sleep(2);

        $url = "https://sibdom.ru/api/get_phone";
        $data = json_encode($send_data, JSON_UNESCAPED_UNICODE);
        var_dump($data); // debug trace kept from the original

        $headers = [
            'cache-control: max-age=0',
            'upgrade-insecure-requests: 1',
            'user-agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
            'sec-fetch-user: ?1',
            'accept: application/json, text/javascript, text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'x-compress: null',
            'sec-fetch-site: none',
            'sec-fetch-mode: navigate',
            'accept-encoding: deflate, br',
            'accept-language: ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Content-Type: application/x-www-form-urlencoded; charset=utf-8',
            'X-Requested-With: XMLHttpRequest',
        ];

        $cookieFile = __DIR__ . '/cookie.txt';

        // Use the fully-qualified https URL computed above; the original passed
        // a scheme-less literal to curl_init() and left $url unused.
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieFile);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $data);

        $html = curl_exec($ch);
        if ($html === false) {
            $this->__log("getPhone: request failed: " . curl_error($ch));
        }
        curl_close($ch);
        var_dump($html); // debug trace kept from the original

        // The original logged an undefined $phone variable; log the value that
        // is actually returned instead.
        $phone = "Нет телефона";
        $this->__log("Phone parsed: " . $phone);

        return $phone;
    }

    /**
     * Decode "\uXXXX" escape sequences (and HTML entities) into UTF-8 text.
     *
     * Each "\uXXXX" with four hexadecimal digits is rewritten as the numeric
     * entity "&#xXXXX;" and the result is passed through html_entity_decode(),
     * so entities already present in $str are decoded as well. ENT_QUOTES is
     * used so escaped quote characters are decoded too.
     *
     * @param string $str Text containing "\uXXXX" sequences.
     *
     * @return string
     */
    public function unicode_escape_decode($str)
    {
        // Only real hex digits: a sequence such as "\uzzzz" is left untouched
        // instead of being turned into an undecodable entity.
        $withEntities = preg_replace('~\\\\u([0-9a-fA-F]{4})~', '&#x$1;', (string) $str);

        return html_entity_decode($withEntities, ENT_QUOTES | ENT_HTML401, 'UTF-8');
    }

    /**
     * Crawl the configured sections: collect listing stubs page by page, then
     * load every collected listing through getItem().
     *
     * Relies on session_start(), checkItem(), saveUrl() and getItem(), which
     * are defined outside this file section.
     *
     * @return bool Always true.
     */
    public function grab()
    {
        print_r("start GRAB");

        $city = conf::$city->id;
        $this->session_start();

        // Keys are quoted: bare identifiers are constant lookups and fail on PHP 8.
        $this->sections = [['deal_type' => "prodam", 'category' => "kvartiry"]];

        if (!isset($this->items) || !is_array($this->items)) {
            $this->items = [];
        }

        foreach ($this->sections as $section) {
            $this->deal_type = $section["deal_type"];
            $this->category = $section["category"];

            $this->__log("\n\n=========\n grab($city, $this->deal_type, $this->category);\n=========\n");

            $url = $this->create_url($this->category);
            $pages = (int) $this->getPages($url);

            // Counter of remaining "already known" items. It is maintained as in
            // the original code but never consulted.
            // NOTE(review): presumably meant to stop paging after $this->limit
            // consecutive known items — confirm before adding a break.
            $limit = $this->limit;

            $firstPage = !empty($this->startpage) ? (int) $this->startpage : 1;
            for ($p = $firstPage; $p <= $pages; $p++) {
                // The original passed an undefined $urlsfx here; create_url()
                // ignores its first argument, so the built URL is unchanged.
                $url = $this->create_url($this->category, $p);
                $items = $this->getItems($url);

                foreach ($items as $item) {
                    if (!$this->checkItem($item)) {
                        $limit--;
                        continue;
                    }
                    $limit = $this->limit;
                    $this->items[$item->id] = $item;
                }
            }

            $n = count($this->items);
            $this->__log("\n\n===========\n ITEMS END: {$n} \n==========\n\n");

            foreach ($this->items as $item) {
                // When getItem() yields nothing truthy, hand the item to saveUrl().
                if (!$this->getItem($item)) {
                    $this->saveUrl($item);
                }
            }
        }

        $this->_session_end();

        return true;
    }

    /**
     * Build the listing-index URL for page $p.
     *
     * The URL is hard-coded to the Krasnoyarsk "flats for sale by owner"
     * listing sorted by date added; $urlsfx is accepted but not used.
     *
     * @param mixed $urlsfx Unused.
     * @param int   $p      Page number.
     *
     * @return string
     */
    public function create_url($urlsfx, $p = 1)
    {
        return "https://www.sibdom.ru/kvartiry/prodam_krasnoyarsk_ot-sobstvennika/?q=20_dateadd_desc&page={$p}";
    }

    /**
     * Intentionally empty; grab() calls _session_end() for the actual cleanup.
     */
    public function session_end()
    {
    }

    /**
     * Best-effort removal of the pid file and the cookie file.
     */
    public function _session_end()
    {
        // $_ENV['ROOT'] with a quoted key: {$_ENV[ROOT]} is a constant lookup
        // and throws on PHP 8. An unset value yields "" like the old fallback.
        $root = isset($_ENV['ROOT']) ? $_ENV['ROOT'] : '';

        // Failures are deliberately ignored: the files may already be gone.
        @unlink("{$root}/../pids/{$this->site->code}.pid");
        @unlink($this->cookies);
    }

    /**
     * Pick a random proxy from get_proxylist() and store it on the instance.
     *
     * Entries are split on ":" into ip, port, login and pass; missing trailing
     * parts become empty strings. $this->__proxy receives the parts and
     * $this->proxy receives "ip:port". Both parameters are ignored.
     *
     * @param string $proxy Unused.
     * @param int    $reset Unused.
     *
     * @return string "ip:port" of the chosen proxy, or "" when the list is empty.
     */
    public function get_proxy($proxy = "", $reset = 0)
    {
        $list = $this->get_proxylist();
        $this->__proxy = (object) ['ip' => "", 'port' => "", 'login' => "", 'pass' => ""];

        if (!is_array($list) || count($list) === 0) {
            // mt_rand(0, -1) on an empty list is an error; report and bail out.
            $this->__log("get_proxy: proxy list is empty");
            $this->proxy = "";
            return $this->proxy;
        }

        $list = array_values($list);
        $entry = $list[mt_rand(0, count($list) - 1)];

        // Pad to four parts so "ip:port" entries do not raise undefined offsets.
        $parts = array_pad(explode(":", (string) $entry, 4), 4, "");
        $this->__proxy->ip = $parts[0];
        $this->__proxy->port = $parts[1];
        $this->__proxy->login = $parts[2];
        $this->__proxy->pass = $parts[3];

        $this->proxy = $this->__proxy->ip . ":" . $this->__proxy->port;

        return $this->proxy;
    }

    /**
     * No-op stub: proxy statistics are not tracked by this class.
     *
     * @param mixed  $n      Unused.
     * @param mixed  $status Unused.
     * @param string $error  Unused.
     *
     * @return bool Always false.
     */
    public function update_proxy($n, $status = "", $error = "")
    {
        return false;
    }
}