timeout = 30;
        $this->check_olditems = 1;
        $this->limit = 150;
        //$this->cookies = "";
        $this->datelimit = date("Y-m-d", time() - 1 * 86400) . " 00:00:00";

        // Deal type => URL prefix. Keys are quoted string literals: a bare word
        // is an undefined-constant lookup, which is an Error on PHP 8.
        $this->deals = [
            # single city
            'prodam' => "prodazha",
            'sdam' => "arenda",
            # multiple cities
            // 'prodam' => "prodazha-nedvizhimosti",
            // 'sdam' => "arenda-nedvizhimosti",
        ];

        $this->categories = [
            'kvartiry' => "kvartiry",
            'komnaty' => "komnaty",
            'doma_dachi_kottedzhi' => "doma_dachi_kottedzhi",
            'zemelnye_uchastki' => "zemelnye_uchastki",
            'kommercheskaya_nedvizhimost' => "kommercheskaya_nedvizhimost",
        ];

        //http://m.domofond.ru/
        // Deal type => category => URL suffix. Only the uncommented entries are grabbed.
        $this->sections = [
            'prodam' => [
                'kvartiry' => "kvartiry",
                'komnaty' => "komnaty",
                // 'doma_dachi_kottedzhi' => "doma",
                // 'zemelnye_uchastki' => "uchastkizemli",
                // 'kommercheskaya_nedvizhimost' => "kommercheskoynedvizhimosti",
            ],
            'sdam' => [
                // 'kvartiry' => "kvartiry",
                // 'komnaty' => "komnaty",
                // 'doma_dachi_kottedzhi' => "doma",
                // 'kommercheskaya_nedvizhimost' => "kommercheskoynedvizhimosti",
            ],
        ];
    }

    /**
     * Tells whether a fetched search page contains the listing-results marker.
     *
     * @param string $html Raw page body.
     * @return int|false Result of preg_match(): 1 when the marker is present, 0 otherwise.
     */
    public static function isValidPageHTML($html)
    {
        return preg_match('/\"resultType\"\:\"Listing\"/is', $html);
    }

    /**
     * Returns the number of result pages for a search URL.
     *
     * NOTE(review): the method currently short-circuits and always reports a
     * single page; everything after the first return is unreachable. The
     * unreachable code is kept as the only record of the intended logic, but
     * its patterns appear to have lost their leading HTML tags, so they must be
     * restored before the short-circuit is removed.
     *
     * @param string $url Search URL.
     * @param int    $n   Retry counter (0 on the first attempt).
     * @return mixed Currently always 1.
     */
    public function _getPages($url, $n = 0)
    {
        return 1;

        if ($n) {
            $err = "getPages({$url}) - error receiving data";
            $this->update_proxy(1, -1, $err);
            if ($n > 3) {
                return $this->__log($err);
            }
        }

        // The fetched page is cached in $this->data so getItems() can reuse it.
        $this->data = $this->get_url($url, true);
        if (!self::isValidPageHTML($this->data)) {
            return $this->_getPages($url, $n + 1);
        }

        $r = preg_match('@]*class="[^"]*pagination__mainPages[^"]*"[^>]*>(.*)@isU', $this->data, $tmp);
        if (!$r) {
            return 1;
        }

        $r = preg_match_all('@]*>\s*]+>\s*(\d+)\s*\s*@isU', $tmp[1], $tmp);
        if (!$r) {
            return 1;
        }

        // A per-category cap (10 pages for "kvartiry", 5 otherwise) used to be
        // computed here but was never applied.
        return max($tmp[1]);
    }

    /**
     * Wrapper around _getPages() that also echoes the page count.
     *
     * @param string $url Search URL.
     * @param int    $n   Retry counter passed through to _getPages().
     * @return mixed Whatever _getPages() returned.
     */
    public function getPages($url, $n = 0)
    {
        $p = $this->_getPages($url, $n);
        echo "getPages: ({$p})" . PHP_EOL;

        return $p;
    }

    /**
     * Fetches one search page and extracts the listings from its embedded JSON.
     *
     * Uses the page cached in $this->data when there is one, otherwise downloads
     * $url. Retries through recursion while the page fails validation, giving up
     * (with an empty array) once $n exceeds 3.
     *
     * @param string $url Search page URL.
     * @param int    $n   Retry counter (0 on the first attempt).
     * @return array Map of listing id => stdClass item; empty on failure.
     */
    public function getItems($url, $n = 0)
    {
        if ($n) {
            $err = "getItems({$url}) - error receiving data";
            $this->update_proxy(1, -1, $err);
            if ($n > 3) {
                $this->__log($err);
                return [];
            }
        }

        $data = $this->data ?: $this->get_url($url, true);
        $this->data = null;

        if (!self::isValidPageHTML($data)) {
            return $this->getItems($url, $n + 1);
        }

        // NOTE(review): this pattern is empty and has no capture group, so it
        // cannot yield a JSON payload; it looks like the original markup-based
        // pattern was lost. Restore it - until then the guard below makes this
        // method return an empty list instead of dereferencing null.
        $r = preg_match('@@isU', $data, $tmp);
        if (!$r) {
            $this->__log('Not found any objects');
            return [];
        }

        $jsonItems = isset($tmp[1]) ? json_decode($tmp[1]) : null;
        $list = isset($jsonItems->itemsState->items) ? $jsonItems->itemsState->items : null;
        if (!is_array($list) && !is_object($list)) {
            $this->__log('Not found any objects');
            return [];
        }

        $items = [];
        foreach ($list as $mydata) {
            $item = new stdClass();
            $item->id = $mydata->id;
            $item->url = "https://" . $this->site->host . $mydata->itemUrl;
            $item->title = $mydata->title;
            $item->price = $mydata->priceValue;
            $item->address = conf::$city->name . " " . $mydata->address;
            $item->description = $mydata->description;
            $item->seller = $mydata->agencyName;
            $item->rooms = $mydata->roomsOrdinal;
            $item->s = $mydata->floorAreaCalculated;

            // Listings without an id cannot be keyed, so they are skipped.
            if ($item->id) {
                $items[$item->id] = $item;
            }
        }

        return $items;
    }

    /**
     * Tells whether a fetched listing page contains the listing marker.
     *
     * @param object $item Listing being fetched (not used by the check).
     * @param string $html Raw page body.
     * @return int|false Result of preg_match().
     */
    public function isValidItemHTML($item, $html)
    {
        return preg_match('/\"resultType\"\:\"Listing\"/is', $html);
    }

    /**
     * Downloads a listing page, fills $item from its embedded JSON, requests the
     * phone number from the RPC endpoint and saves the item.
     *
     * The former regex-based HTML parser that lived here in comments has been
     * dropped; it also populated $item->material, $item->object_type,
     * $item->params and title-based fallbacks for level/area, none of which the
     * JSON path sets.
     *
     * @param object $item Listing stub with at least ->id and ->url.
     * @param int    $n    Retry counter (0 on the first attempt).
     * @return mixed The saved $item on success; otherwise the result of __log().
     */
    public function getItem($item, $n = 0)
    {
        if ($n) {
            $err = "getItem({$item->id}) - error receiving data";
            $this->update_proxy(1, -1, $err);
            if ($n > 3) {
                return $this->__log($err);
            }
        }

        $data = $this->get_url($item->url, true);
        if (!$this->isValidItemHTML($item, $data)) {
            return $this->getItem($item, $n + 1);
        }

        // NOTE(review): same empty pattern as in getItems() - the original
        // expression appears to have been lost and needs restoring. A missing or
        // undecodable payload is treated like "no match" and retried.
        $r = preg_match('@@isU', $data, $tmp);
        $jsonItem = ($r && isset($tmp[1])) ? json_decode($tmp[1]) : null;
        if (!isset($jsonItem->itemState->item)) {
            $this->__log('Not found object details');
            return $this->getItem($item, $n + 1);
        }
        $myitem = $jsonItem->itemState->item;

        $item->title = $myitem->title;
        $item->description = $myitem->description;
        $item->rooms = $myitem->roomsOrdinal;
        $item->address = $myitem->address;
        if (preg_match('/^(?:Комната|Студия)\b/isu', $item->title)) {
            $item->rooms = 1;
        }

        // "H" (24-hour clock); the previous "h" lost the AM/PM distinction.
        $item->date = date('Y-m-d H:i:s', strtotime($myitem->updatedAt));

        // TODO: derive $item->object_type from the $myitem->detailGroups[0]->details
        // entry whose name is "Тип объекта" (value still needs converting).

        $item->level = $myitem->floorInt;
        // The "floor/total" string is read from the decoded listing; $item itself
        // never receives a floorString in this file. The old property is still
        // honoured as a fallback. NOTE(review): confirm the JSON field name.
        $floorString = '';
        if (isset($myitem->floorString)) {
            $floorString = (string) $myitem->floorString;
        } elseif (isset($item->floorString)) {
            $floorString = (string) $item->floorString;
        }
        if (preg_match('@^(\d+)\/(\d+)$@isU', $floorString, $tmp)) {
            $item->level = intval($tmp[1]);
            $item->levels = intval($tmp[2]);
        }

        $item->s = $myitem->floorAreaCalculated;

        // For every photo, keep the URL(s) of the widest size variant.
        $urls = [];
        $galleries = isset($myitem->galleries) ? $myitem->galleries : [];
        foreach ($galleries as $gallery) {
            // Comparison, not assignment: the old "=" accepted every gallery type
            // and overwrote galleryType.
            if ($gallery->galleryType !== "PhotoGallery") {
                continue;
            }
            foreach ($gallery->images as $variants) {
                $max = 0;
                foreach ($variants as $variant) {
                    $max = max($max, intval($variant->size->width));
                }
                foreach ($variants as $variant) {
                    if (intval($variant->size->width) === $max) {
                        $urls[] = $variant->url;
                    }
                }
            }
        }
        $item->images = implode("\n", $urls);

        $payload = '{"id":"1","jsonrpc":"2.0","method":"item.GetItemPhoneV1","params":{"meta":{"platform":"web","language":"ru"},"id":' . $item->id . ',"itemType":"Listing"}}';
        $postResult = $this->post_url("https://api.domofond.ru/rpc", $payload, true);
        $reply = $postResult ? json_decode($postResult) : null;
        if (empty($reply->result->phone)) {
            return $this->__log("Phone parse error");
        }

        $phone = $reply->result->phone;
        $this->__log("Phone parsed: " . $phone);
        $item->phone = str_replace('+7', '8', $phone);

        $item->seller = isset($myitem->agent->name) ? $myitem->agent->name : null;
        $item->display = $this->checkPhone($item->phone);

        $this->saveItem($item);

        return $item;
    }

    /**
     * Builds the search URL for the current deal type and the given section.
     *
     * @param string $urlsfx Section suffix from $this->sections.
     * @param int    $p      Page number; omitted from the query when it is 1.
     * @return string Absolute search URL.
     */
    public function create_url($urlsfx, $p = 1)
    {
        $q = [
            // 'CityIds' => "2254",
            'PropertyTypeDescription' => $urlsfx,
            'RentalRate' => "Month",
            'PrivateListingType' => "PrivateOwner",
            'Page' => $p,
            'SortOrder' => "Newest",
        ];

        if ($this->deal_type == "prodam") {
            unset($q["RentalRate"]);
        }
        if ($q["Page"] == 1) {
            unset($q["Page"]);
        }

        # single city (the stray space that used to sit inside the path is removed)
        return "https://{$this->site->host}/{$this->deals[$this->deal_type]}-{$urlsfx}-tomsk-c1942?" . http_build_query($q);

        # multiple cities (was an unreachable second return):
        // return "https://{$this->site->host}/{$this->deals[$this->deal_type]}/search?" . http_build_query($q);
    }

    /**
     * Performs a GET request, optionally through a proxy, retrying on failure.
     *
     * Once $n exceeds $this->fails / 3 the transport flips between proxy and
     * direct mode (resetting $n when switching back to proxy). Note that the
     * retry recursion has no upper bound of its own.
     *
     * @param string       $url   Absolute http(s) URL.
     * @param bool|string  $proxy true for any proxy, false for a direct request;
     *                            other values are forwarded to get_proxy().
     * @param array|string $opts  Extra request header line(s).
     * @param int          $n     Attempt counter.
     * @return mixed Response body; false on a direct-mode 404; the result of
     *               error() for an invalid URL.
     */
    public function get_url($url, $proxy = true, $opts = [], $n = 1)
    {
        if (!$url || !preg_match("/^https?[:]\/\//", $url)) {
            return $this->error("get_url: incorrect url = {$url}");
        }

        if ($n > $this->fails / 3) {
            if ($proxy) {
                $proxy = false;
            } else {
                $proxy = true;
                $n = 1;
            }
        }

        $log = "get_url: url = {$url}";

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookies);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookies);

        $headers = is_array($opts) ? $opts : [$opts];
        $headers[] = "Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3";
        $headers[] = "Cache-Control: max-age=0";
        $headers[] = "Connection: keep-alive";
        $headers[] = "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36";

        if ($proxy) {
            curl_setopt($ch, CURLOPT_MAXREDIRS, 2);
            $this->get_proxy($proxy === true ? "" : $proxy);
            curl_setopt($ch, CURLOPT_PROXY, $this->proxy);
            if ($this->__proxy->pass) {
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $this->__proxy->login . ":" . $this->__proxy->pass);
            }
            $log .= ", proxy = {$this->proxy}";
            $headers[] = "X-Forwarded-For: {$this->__proxy->ip}";
        } else {
            $this->__proxy = null;
            $this->proxy = null;
            // Direct requests are throttled.
            sleep(3);
        }

        $this->__log($log);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

        $ss = curl_exec($ch);
        $er = curl_error($ch);
        $ci = curl_getinfo($ch);
        curl_close($ch);

        $f = $er || $ci["http_code"] != 200;
        if ($proxy) {
            $this->update_proxy($f, $ci["http_code"], $er);
        } elseif ($ci["http_code"] == 404) {
            return false;
        }

        if ($f) {
            $this->__log("{$log} fails (err = {$er}, http_code = {$ci["http_code"]}), retry...");
            return $this->get_url($url, $proxy, $opts, $n + 1);
        }

        return $ss;
    }

    /**
     * Performs a POST request, optionally through a proxy, retrying on failure.
     *
     * Once $n exceeds $this->fails / 2 the transport flips between proxy and
     * direct mode. A default User-Agent is added only when $opts carries none.
     *
     * @param string       $url   Absolute http(s) URL.
     * @param mixed        $data  Request body passed to CURLOPT_POSTFIELDS.
     * @param bool|string  $proxy true for any proxy, false for a direct request;
     *                            other values are forwarded to get_proxy().
     * @param array|string $opts  Extra request header line(s).
     * @param int          $n     Attempt counter.
     * @return mixed Response body; false on a direct-mode 40x; the result of
     *               error() for an invalid URL.
     */
    public function post_url($url, $data, $proxy = true, $opts = [], $n = 1)
    {
        if (!$url || !preg_match("/^https?[:]\/\//", $url)) {
            return $this->error("post_url: incorrect url = {$url}");
        }

        if ($n > $this->fails / 2) {
            if ($proxy) {
                $proxy = false;
            } else {
                $proxy = true;
                $n = 1;
            }
        }

        $log = "post_url: url = {$url}";

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);

        $headers = is_array($opts) ? $opts : [$opts];

        $ua = false;
        foreach ($headers as $_) {
            if (preg_match("/^User-Agent:/i", $_)) {
                $ua = true;
                break;
            }
        }

        $headers[] = "Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3";
        $headers[] = "Cache-Control: max-age=0";
        $headers[] = "Connection: keep-alive";
        $headers[] = "Content-type: text/plain";
        if (!$ua) {
            $headers[] = "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36";
        }

        if ($proxy) {
            $this->get_proxy($proxy === true ? "" : $proxy);
            curl_setopt($ch, CURLOPT_PROXY, $this->proxy);
            if ($this->__proxy->pass) {
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $this->__proxy->login . ":" . $this->__proxy->pass);
            }
            $log .= ", proxy = {$this->proxy}";
            $headers[] = "X-Forwarded-For: {$this->__proxy->ip}";
        } else {
            $this->__proxy = null;
            $this->proxy = null;
            // Direct requests are throttled.
            sleep(2);
        }

        $this->__log($log);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

        $ss = curl_exec($ch);
        $er = curl_error($ch);
        $ci = curl_getinfo($ch);
        curl_close($ch);

        $f = $er || $ci["http_code"] != 200;
        if ($proxy) {
            $this->update_proxy($f, $ci["http_code"], $er);
        } elseif (preg_match("/^40\d$/", (string) $ci["http_code"])) {
            $this->__log("{$log} fails (err = {$er}, http_code = {$ci["http_code"]})");
            return false;
        }

        if ($f) {
            $this->__log("{$log} fails (err = {$er}, http_code = {$ci["http_code"]}), retry...");
            // $data must be forwarded: the previous call omitted it, shifting
            // every following argument by one position on each retry.
            return $this->post_url($url, $data, $proxy, $opts, $n + 1);
        }

        return $ss;
    }

    /**
     * Grabs one city / deal type / category combination: collects listing stubs
     * from the search pages, then fetches every collected listing.
     *
     * Paging through a section stops after $this->limit consecutive listings
     * rejected by checkItem().
     *
     * @param string $city      Key of $this->cities.
     * @param string $deal_type Key of $this->deals.
     * @param string $category  Key of $this->categories.
     * @return bool false when the combination is not configured, true otherwise.
     */
    public function _grab($city, $deal_type, $category)
    {
        $configured = isset($this->cities[$city])
            && isset($this->deals[$deal_type])
            && isset($this->categories[$category])
            && isset($this->sections[$deal_type][$category]);
        if (!$configured) {
            return false;
        }

        $this->city = $city;
        $this->deal_type = $deal_type;
        $this->category = $category;
        $this->session_start();
        $this->items = [];

        // A section may be a single suffix or a list of suffixes.
        $sections = $this->sections[$deal_type][$category];
        if (!is_array($sections)) {
            $sections = [$sections];
        }

        foreach ($sections as $urlsfx) {
            $url = $this->create_url($urlsfx);
            $pages = $this->getPages($url);
            $limit = $this->limit;

            for ($p = 1; $p <= $pages; $p++) {
                $url = $this->create_url($urlsfx, $p);
                $items = $this->getItems($url);

                foreach ($items as $item) {
                    if (!$this->checkItem($item)) {
                        $limit--;
                    } else {
                        // An accepted listing resets the rejection streak.
                        $limit = $this->limit;
                        $this->items[$item->id] = $item;
                    }
                    if (!$limit) {
                        break;
                    }
                }

                if (!$limit) {
                    break;
                }
            }
        }

        $n = count($this->items);
        $this->__log("\n\n=========\n ITEMS FOUND: {$n}\n=========\n\n");

        // Listings are processed in the reverse of the order they were collected.
        $this->items = array_reverse($this->items);
        foreach ($this->items as $item) {
            $this->getItem($item);
        }

        return true;
    }

    /**
     * Points $this->cookies at this site's cookie file and removes any old one.
     *
     * NOTE(review): the path root is read from $_ENV['ROOT']; the previous
     * unquoted {$_ENV[ROOT]} form is a constant lookup and an Error on PHP 8.
     * Confirm no constant named ROOT was being relied on.
     */
    public function session_start()
    {
        $this->cookies = "{$_ENV['ROOT']}/../logs/{$this->site->code}.cookie";
        @unlink($this->cookies);
    }

    /**
     * Intentionally empty; cleanup happens in _session_end().
     */
    public function session_end()
    {
    }

    /**
     * Removes this site's pid file and cookie file (best effort).
     */
    public function _session_end()
    {
        @unlink("{$_ENV['ROOT']}/../pids/{$this->site->code}.pid");
        @unlink($this->cookies);
    }

    /**
     * Runs parent::grab() for every configured city, deal type and category,
     * then cleans up the session files.
     */
    public function grab()
    {
        foreach ($this->cities as $city => $city_id) {
            foreach ($this->sections as $deal_type => $categories) {
                foreach ($categories as $category => $params) {
                    parent::grab($city, $deal_type, $category);
                }
            }
        }

        $this->_session_end();
    }

    /**
     * Kills the previously recorded grabber process, if any, starts a new one in
     * the background and stores its PID in the pid file.
     */
    public function cron()
    {
        $root = $_ENV['ROOT'];
        $logfile = "{$root}/../logs/{$this->site->code}.log";
        $pidfile = "{$root}/../pids/{$this->site->code}.pid";

        if (is_file($pidfile)) {
            // The stored PID carries a trailing newline and must never reach the
            // shell unvalidated, so it is reduced to a positive integer first.
            $oldPid = (int) trim((string) file_get_contents($pidfile));
            if ($oldPid > 0) {
                shell_exec("kill -9 {$oldPid} > /dev/null 2>&1");
            }
        }

        // "echo $!" prints the PID of the background job just started.
        $cmd = 'php ' . escapeshellarg("{$root}/grabber.php")
            . ' ' . escapeshellarg($this->site->code)
            . ' > ' . escapeshellarg($logfile)
            . ' & echo $!';

        $pid = shell_exec($cmd);
        file_put_contents($pidfile, $pid);
    }

    /**
     * Picks a random entry from get_proxylist() and stores it in $this->__proxy
     * (ip, port, login, pass) and $this->proxy ("ip:port").
     *
     * Entries are split on ":"; missing trailing fields become empty strings.
     * With an empty proxy list, $this->proxy is set to "" and returned.
     *
     * @param mixed $proxy Ignored; overwritten by the proxy list.
     * @param int   $reset Ignored.
     * @return string The selected "ip:port", or "" when no proxy is available.
     */
    public function get_proxy($proxy = "", $reset = 0)
    {
        $proxy = $this->get_proxylist();
        $this->__proxy = (object) ['ip' => "", 'port' => "", 'login' => "", 'pass' => ""];

        $pool = is_array($proxy) ? array_values($proxy) : [];
        if (!$pool) {
            // mt_rand(0, -1) is invalid, so an empty list is handled explicitly.
            $this->proxy = "";
            return $this->proxy;
        }

        $entry = $pool[mt_rand(0, count($pool) - 1)];
        $parts = array_pad(explode(":", $entry), 4, "");
        list($this->__proxy->ip, $this->__proxy->port, $this->__proxy->login, $this->__proxy->pass) = $parts;

        $this->proxy = $this->__proxy->ip . ":" . $this->__proxy->port;

        return $this->proxy;
    }

    /**
     * Stub: proxy statistics are not tracked; the arguments are ignored.
     *
     * @param mixed $n      Failure flag.
     * @param mixed $status HTTP status code.
     * @param mixed $error  Error text.
     * @return bool Always false.
     */
    public function update_proxy($n, $status = "", $error = "")
    {
        return false;
    }
}