1, 'Второй' => 2, 'Третий' => 3, 'Четвертый' => 4, 'Пятый' => 5, 'Шестой' => 6, 'Седьмой' => 7, 'Восьмой' => 8, 'Девятый' => 9, 'Десятый' => 10, 'Одиннадцатый' => 11, 'Двенадцатый' => 12, 'Тринадцатый' => 13, 'Четырнадцатый' => 14, 'Пятнадцатый' => 15, 'Шестнадцатый' => 16, 'Семнадцатый' => 17, 'Восемнадцатый' => 18, 'Девятнадцатый' => 19, 'Двадцатый' => 20 );

    // Stubs (the original comment calls them "abstract, trash"); presumably
    // required by the parent class - verify before removing.

    /**
     * Not implemented for this site.
     */
    public function create_url($urlsfx, $p = 1)
    {
        // TODO: Implement create_url() method.
    }

    /**
     * Not implemented for this site.
     */
    public function getPages($url, $n = 0)
    {
        // TODO: Implement getPages() method.
    }

    public $timeout = 30;

    public $check_olditems = TRUE;

    /**
     * Initialises grabber limits and resets the inspector flag of every youla link.
     *
     * Side effect: sets NeedInspector = 0 on all oris.GrabberObjectLinks rows whose
     * Url contains "youla"; dies with the MySQL error if that update fails.
     *
     * @param mixed $site_id Passed through to the parent constructor.
     */
    public function __construct($site_id)
    {
        parent::__construct($site_id);

        $this->numUncorrectObjOnPage = 0;
        $this->limit = 50000000;
        $this->datelimit = '2020-01-01 00:00:00';

        mysql_query("UPDATE oris.GrabberObjectLinks set NeedInspector =0 where Url like '%youla%' ") or die(mysql_error());
    }

    public $cityinurl = "";

    /**
     * Walks every listing page of every category for each configured city.
     *
     * For each category the listing pages are requested one after another until
     * getItems() returns an empty result. getItems() itself marks the links it has
     * seen (NeedInspector = 1); after paging, links still at NeedInspector = 0 are
     * switched to NeedInspector = 2. Finally addIncorrectObjects() is run and the
     * pid file is removed.
     *
     * NOTE(review): in the reviewed source the paging loop had unbalanced braces and
     * returned `$this->getItems($url, $n + 1)` with undefined `$i` / `$n` on the first
     * non-empty page. It is restored here to "stop on the first empty page"; retries
     * are already handled inside getItems(). Confirm this matches the intent.
     */
    public function grab()
    {
        $categories = array(
            'prodaja-kvartiri' => 'prodam',
            'prodaja-komnati' => 'prodam',
            'arenda-kvartiri' => 'sdam',
            'arenda-komnati' => 'sdam',
            'prodaja-doma' => 'prodam',
            'prodaja-uchastka' => 'prodam',
            'prochie-stroeniya' => 'prodam',
            'arenda-doma' => 'sdam',
        );

        // TODO: This is a crutch to keep updates working. The proper solution is to
        // override create_url and get_pages and adapt them so the standard grab
        // method can be used.
        $costulDlyaCategories = array(
            'prodaja-kvartiri' => 'kvartiry',
            'prodaja-komnati' => 'komnaty',
            'arenda-kvartiri' => 'kvartiry',
            'arenda-komnati' => 'komnaty',
            'prodaja-doma' => 'doma_dachi_kottedzhi',
            'prodaja-uchastka' => 'zemelnye_uchastki',
            'prochie-stroeniya' => 'kommercheskaya_nedvizhimost',
            'arenda-doma' => 'doma_dachi_kottedzhi',
        );

        // City code used in the URL => human readable name (the name is not used below).
        $arr = [
            'krasnoyarsk' => "Красноярск",
        ];

        foreach ($arr as $code => $name) {
            $this->cityinurl = $code;
            $url = 'https://youla.ru/' . $this->cityinurl . '/nedvijimost/';

            foreach ($categories as $category => $dealType) {
                $page = 0;
                $this->items = [];
                $limit = $this->limit;
                $this->city = $this->cityinurl;
                $this->deal_type = $dealType;
                $this->category = $costulDlyaCategories[$category];

                while (TRUE) {
                    $page++;
                    $items = $this->getItems($url . $category . '?attributes%5Bsobstvennik_ili_agent%5D%5B0%5D=10705&attributes%5Bsort_field%5D=date_published&page=' . $page);

                    if (empty($items)) {
                        print_r("Popitka end");
                        break;
                    }

                    foreach ($items as $item) {
                        $this->__log("Current url item = {$item->url}");

                        // The checkItem()-based collection into $this->items is disabled
                        // in the source, so $limit is never decremented and $this->items
                        // stays empty; the guard below is kept for when it is re-enabled.
                        if (!$limit) {
                            break 2;
                        }
                    }
                }

                mysql_query("UPDATE oris.GrabberObjectLinks set NeedInspector = 2 where NeedInspector=0 and DeleteDate is NULL and Url like '%youla%' ") or die(mysql_error());

                $this->__log("\n\n=========\n ITEMS FOUND: " . $category . ' - ' . count($this->items) . "\n=========\n\n");

                foreach (array_reverse($this->items) as $item) {
                    if (!$this->getItem($item)) {
                        $this->saveUrl($item);
                    }
                }
            }
        }

        // $this->doubleCheck() used to be called here; it is disabled in the source.
        $this->addIncorrectObjects();

        @unlink("{$_ENV['ROOT']}/../pids/{$this->site->code_alias}.pid");
    }

    /**
     * Re-requests every link flagged NeedInspector = 2 and, when the page is still
     * reachable (not HTTP 404) and does not contain "продано" / "Удалено", flags the
     * link with NeedInspector = 4.
     *
     * NOTE(review): the loop header of this method was destroyed in the reviewed
     * source (`for ($i=0; $iget_url(...`). The iteration below is a reconstruction
     * that visits every selected row; the method is currently not called anywhere in
     * the visible code. Verify against version control before re-enabling it.
     */
    private function doubleCheck()
    {
        $items4check = mysql_query("SELECT Url from oris.GrabberObjectLinks where NeedInspector = 2 and DeleteDate is NULL and Url like '%youla%' ");

        while ($items4check && ($item = mysql_fetch_row($items4check))) {
            $page = $this->get_url($item[0], FALSE);
            $this->__log("Item for check url {$item[0]}\n CODE HTTP STATUS {$this->http_code} \n");

            if (!(preg_match("/продано|Удалено/", $page)) and ($this->http_code != 404)) {
                print_r("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!".$item[0]);
                $urlSql = mysql_real_escape_string($item[0]);
                mysql_query("Update oris.GrabberObjectLinks set NeedInspector = 4 where NeedInspector = 2 and DeleteDate is NULL and Url='{$urlSql}' ");
            }
        }
    }

    private $uncorrectedObjectLimit = 50;

    /**
     * Downloads one listing page and extracts the advertisements on it.
     *
     * For every extracted advertisement whose id already occurs in
     * oris.GrabberObjectLinks, the link is flagged NeedInspector = 1 and its LinkDate
     * is updated.
     *
     * @param string $url Listing page URL.
     * @param int    $n   Retry counter; when non-zero the proxy is rotated via
     *                    update_proxy(), and once it exceeds $this->fails the method
     *                    gives up.
     *
     * @return array Map of site id => item object (k, id, date, title, url, price);
     *               empty when nothing could be parsed.
     */
    public function getItems($url, $n = 0)
    {
        $items = array();

        if ($n) {
            $err = "getItems({$url}) - error receiving data";
            $this->update_proxy(1, -1, $err);
            if ($n > $this->fails) {
                $this->__log($err);
                return [];
            }
        }

        $data = $this->get_url($url, FALSE);

        // NOTE(review): the patterns in this method look truncated in the reviewed
        // source (leading HTML tag parts are missing). They are reproduced
        // byte-for-byte; compare with version control.
        $tmp = preg_split('/]+class="product_list[^>]+">/ismU', $data);
        $tmp = preg_split('/<\/ul>/ismU', isset($tmp[1]) ? $tmp[1] : '');

        if (!$tmp[0]) {
            // No product list: retry unless the page shows an alert message block.
            if (!preg_match("/alert_message__title/ismU", $data)) {
                return $this->getItems($url, $n + 1);
            }
            return $items;
        }

        // Split the list into individual <li> items.
        $rgxp = ']+class="product_item[^>]+>((?!<\/li>).)*<\/li>';
        if (!preg_match_all('{' . $rgxp . '}ism', $tmp[0], $matches)) {
            return $items;
        }

        // Per-item data pattern.
        // NOTE(review): the item fields below read capture groups 1..6, while the
        // pattern as visible here defines fewer groups - see the note above.
        $rgxp= ']*[^h]*hidden-xs\">([^<]*)[^\"]*\"visible-xs\">([^<]+)';

        foreach ($matches[0] as $item_parsed) {
            // Stop once too many consecutive items had a link outside "nedvijimost".
            if ($this->numUncorrectObjOnPage == $this->uncorrectedObjectLimit) {
                break;
            }

            if (!preg_match('{' . $rgxp . '}ism', $item_parsed, $tmp)) {
                continue;
            }

            $fl = preg_match('/nedvijimost/', $tmp[2]);
            if ($fl == 0) {
                $this->numUncorrectObjOnPage++;
                print_r("Unccorrected link\n".$tmp[2]);
                continue;
            }
            $this->numUncorrectObjOnPage = 0;

            $items[$tmp[1]] = (object) array(
                'k' => $tmp[1],
                'id' => $this->getMappedId($tmp[1]),
                'date' => $this->get_date($tmp[6], $tmp[5]),
                'title' => trim($tmp[3]),
                'url' => 'https://youla.ru' . $tmp[2],
                'price' => preg_replace('/\D+/', '', trim(html_entity_decode(strip_tags($tmp[4])))),
            );
        }

        $this->__log("==================================================================================");

        foreach ($items as $item) {
            $this->__log("Item url {$item->url} | id {$item->k} | date {$item->date}");

            // The id and date come from scraped HTML, so escape them before use in SQL.
            $keySql = mysql_real_escape_string($item->k);
            $dateSql = mysql_real_escape_string($item->date);

            $countIds = mysql_fetch_row(
                mysql_query("SELECT COUNT(*) FROM oris.GrabberObjectLinks WHERE Url like '%-{$keySql}' and Url like '%youla%' ")
            )[0];

            if ($countIds > 0) {
                mysql_query("UPDATE oris.GrabberObjectLinks set NeedInspector =1 where DeleteDate is NULL and Url like '%-{$keySql}' and Url like '%youla%'") or die(mysql_error());
                mysql_query("Update oris.GrabberObjectLinks set LinkDate = '{$dateSql}' where Url like '%-{$keySql}' and Url like '%youla%'") or die(mysql_error());
            }
        }

        return $items;
    }

    /**
     * Converts a "d.m.Y" date plus an optional "HH:MM" time fragment to "Y-m-d H:i".
     *
     * @param string $date Date in d.m.Y format.
     * @param string $time Any text containing HH:MM; defaults to 00:00 when absent.
     *
     * @return string Formatted date, or '0000-00-00 00:00' when the input cannot be
     *                parsed (addIncorrectObjects() treats a zero LinkDate as unknown).
     */
    public function get_date($date, $time = "")
    {
        $time = preg_match('/(\d\d\:\d\d)/', $time, $match) ? $match[1] : '00:00';

        $dateObj = date_create_from_format('d.m.Y H:i', $date . ' ' . $time);
        if ($dateObj === false) {
            // Previously this was a fatal "call to a member function on boolean".
            return '0000-00-00 00:00';
        }

        return $dateObj->format('Y-m-d H:i');
    }

    /**
     * Downloads a single advertisement page, fills $item from the embedded
     * window.__YOULA_STATE__ JSON and stores it via updateItem() / saveItem().
     *
     * @param object $item Item produced by getItems(); modified in place.
     * @param int    $n    Retry counter, handled the same way as in getItems().
     *
     * @return mixed The filled $item on success; otherwise the return value of
     *               __log() for the failure message.
     */
    public function getItem($item, $n = 0)
    {
        $item->is_agency = 0;

        if ($n) {
            $err = "getItem({$item->id}) - error receiving data";
            $this->update_proxy(1, -1, $err);
            if ($n > $this->fails) {
                return $this->__log($err);
            }
        }

        $data = $this->get_url($item->url, FALSE);

        if (!preg_match('{window.__YOULA_STATE__ =([^\n]+)}', $data, $tmp)) {
            return $this->__log("getItem({$item->id}) - error parsing JSON");
        }

        $json = json_decode(trim(trim($tmp[1]), ';'), TRUE);
        if (!isset($json['entities']['products'][0]) || !is_array($json['entities']['products'][0])) {
            // Invalid JSON or no product entry: treat it like the parse failure above.
            return $this->__log("getItem({$item->id}) - error parsing JSON");
        }
        $jsonProduct = $json['entities']['products'][0];

        $item->phone = isset($jsonProduct['owner']['displayPhoneNum']) ? $jsonProduct['owner']['displayPhoneNum'] : NULL;
        if (empty($item->phone)) {
            $item->phone = "Нет телефона";
        } else {
            // Back-fill rows that were stored earlier without a phone number.
            $tmpphone = preg_replace("/^7/sm", "8", $item->phone);
            $phoneSql = mysql_real_escape_string($tmpphone);
            $urlSql = mysql_real_escape_string($item->url);
            mysql_query("UPDATE grabber.objects SET phone = '{$phoneSql}' WHERE url = '{$urlSql}' AND phone = 'Нет телефона'");
            mysql_query("UPDATE grabber.objects_links SET phone = REPLACE(phone,'23:\"Нет телефона','11:\"{$phoneSql}') WHERE url = '{$urlSql}' AND phone LIKE '%Нет телефона%'");
        }

        // The category is derived from the URL.
        $this->_getObjectType($item->url, $item);
        $this->_getAgency($data, $item);

        $item->seller = $jsonProduct['owner']['name'];
        $item->title = $jsonProduct['name'];
        $item->address = $jsonProduct['location']['description'];
        $item->description = $jsonProduct['description'];
        $item->display = 1;
        $item->distance = is_null($jsonProduct['distance']) ? '' : $jsonProduct['distance'];

        $this->_getParams($jsonProduct['attributes'], $item);
        $this->_getImages($jsonProduct['images'], $item);

        $dt = new DateTime();
        $dt->setTimestamp($jsonProduct['datePublished']['timestamp']);
        $item->date = $dt->format('Y-m-d H:i:s');

        if (isset($this->olditems[$item->id])) {
            $this->updateItem($item);
        } else {
            $this->saveItem($item);
        }

        return $item;
    }

    /**
     * Sets $item->category from the category slug found in $data (the item URL).
     * Leaves the category untouched when no known slug is present.
     *
     * @param string $data Text to search, e.g. the advertisement URL.
     * @param object $item Item to update.
     */
    private function _getObjectType($data, $item)
    {
        // Order matters: the first matching pattern wins.
        $categoryByPattern = array(
            '/(arenda|prodaja)\-doma/i' => 'doma_dachi_kottedzhi',
            '/prochie\-stroeniya/i' => 'kommercheskaya_nedvizhimost',
            '/(arenda|prodaja)\-komnati/i' => 'komnaty',
            '/(arenda|prodaja)\-kvartiri/i' => 'kvartiry',
            '/prodaja\-uchastka/i' => 'zemelnye_uchastki',
        );

        foreach ($categoryByPattern as $pattern => $category) {
            if (preg_match($pattern, $data)) {
                $item->category = $category;
                return;
            }
        }
    }

    /**
     * Maps the product attribute list onto item fields (level, levels, material,
     * is_agency, rooms, s) and collects everything else into $item->params as a
     * "Name: value; ..." string.
     *
     * @param array|null $data List of attributes, each with 'slug' and 'rawValue'.
     * @param object     $item Item to update.
     */
    private function _getParams($data, $item)
    {
        $paramNames = array(
            'realty_etaj' => 'Этаж',
            'balkon' => 'Балкон',
            'realty_infrastructure' => 'Инфраструктура',
            'lift' => 'Лифт',
            'realty_etajnost_doma' => 'Этажность',
            'tip_doma' => 'Тип дома',
            'sobstvennik_ili_agent' => 'Собственник или агент',
            'komnat_v_kvartire' => 'Количество комнат',
            'remont' => 'Ремонт',
            'sanuzli' => 'Санузел',
            'let_v_sobstvennosti' => 'Лет в собственности',
            'realty_obshaya_ploshad' => 'Общая площадь',
            'realty_ploshad_kuhni' => 'Площадь кухни',
        );

        $item->raion = '';
        $item->land = 0.0;
        $params = array();

        // A product without attributes must not break the loop below.
        if (!is_array($data)) {
            $data = array();
        }

        foreach ($data as $attribute) {
            switch ($attribute['slug']) {
                case 'realty_etaj':
                    $item->level = $this->_getLevel($attribute['rawValue']);
                    break;
                case 'realty_etajnost_doma':
                    $item->levels = is_numeric($attribute['rawValue']) ? $attribute['rawValue'] : 0;
                    break;
                case 'tip_doma':
                    $item->material = empty($attribute['rawValue']) ? '' : $attribute['rawValue'];
                    break;
                case 'sobstvennik_ili_agent':
                    if ($attribute['rawValue'] != 'Собственник') {
                        $item->is_agency = 1;
                    }
                    break;
                case 'komnat_v_kvartire':
                    $item->rooms = $this->_getRooms($attribute['rawValue']);
                    break;
                case 'realty_obshaya_ploshad':
                    // The raw value is divided by 100 before being stored.
                    $item->s = $attribute['rawValue'] / 100;
                    break;
                case 'realty_ploshad_kuhni':
                    $params[] = $paramNames['realty_ploshad_kuhni'] . ': ' . ($attribute['rawValue'] / 100);
                    break;
                default:
                    $params[] = (
                        isset($paramNames[$attribute['slug']])
                            ? $paramNames[$attribute['slug']]
                            : $attribute['slug']
                    ) . ': ' . $attribute['rawValue'];
                    break;
            }
        }

        $item->params = implode('; ', $params);
    }

    /**
     * Resolves a floor value: numeric values are returned as is, known ordinal words
     * are looked up in $this->level, anything else yields 0.
     */
    private function _getLevel($value)
    {
        if (is_numeric($value)) {
            return $value;
        }

        return isset($this->level[$value]) ? $this->level[$value] : 0;
    }

    /**
     * Extracts the digits from a room-count value; returns 0 when there are none.
     */
    private function _getRooms($value)
    {
        $res = preg_replace('/\D+/', '', $value);

        return is_numeric($res) ? $res : 0;
    }

    /**
     * Flags the item as an agency listing when the matched text is not "Собственник".
     *
     * NOTE(review): the pattern looks truncated in the reviewed source (only the
     * capture group is left); it is reproduced byte-for-byte. Compare with version
     * control.
     */
    private function _getAgency($data, $item)
    {
        if (preg_match('{(.+)}ismU', $data, $matches) && (trim($matches[1]) != 'Собственник')) {
            $item->is_agency = 1;
        }
    }

    /**
     * Stores the image URLs of the product in $item->images, one URL per line.
     *
     * @param array|null $data List of images, each with a 'url' key.
     * @param object     $item Item to update.
     */
    private function _getImages($data, $item)
    {
        $urls = array();

        if (is_array($data)) {
            foreach ($data as $image) {
                $urls[] = $image['url'];
            }
        }

        $item->images = implode("\n", $urls);
    }

    /**
     * Logs the error and terminates the script.
     */
    protected function error($err)
    {
        die($this->__log($err));
    }

    /**
     * Kills a previously started grabber process (if a pid file exists) and starts a
     * new background run, writing its pid to the pid file.
     */
    public function cron()
    {
        $logfile = "{$_ENV['ROOT']}/../logs/{$this->site->code_alias}.log";
        $pidfile = "{$_ENV['ROOT']}/../pids/{$this->site->code_alias}.pid";

        if (is_file($pidfile)) {
            // The file holds the output of `echo $!` (pid plus newline); only a
            // positive integer is ever handed to the shell.
            $pid = (int) trim(file_get_contents($pidfile));
            if ($pid > 0) {
                shell_exec("kill -9 {$pid} > /dev/null 2>&1");
            }
        }

        $cmd = "php {$_ENV['ROOT']}/grabber.php {$this->site->code_alias} > {$logfile} & echo $!";
        $pid = shell_exec($cmd);
        file_put_contents($pidfile, $pid);
    }

    /**
     * Derives a key from $t: splits it into hex runs, reverses the run order for even
     * $id, concatenates the runs and keeps every third character starting with the
     * first.
     */
    public function get_pkey($id, $t)
    {
        $chunks = preg_split("/[^0-9a-f]+/", $t);
        $joined = implode("", $id % 2 ? $chunks : array_reverse($chunks));
        $chars = str_split($joined);

        $r = '';
        foreach ($chars as $i => $char) {
            if ($i % 3 === 0) {
                $r .= $char;
            }
        }

        return $r;
    }

    /**
     * Checks the page for an `avito.item.phone = '...'` assignment.
     *
     * NOTE(review): the pattern refers to avito although this class grabs youla -
     * possibly carried over from another grabber; confirm whether it is still used.
     */
    public function isValidItemHtml($html, $item)
    {
        return preg_match("/\bavito\.item\.phone\s*=\s*['\"]([^'\"]+)['\"]/ismU", $html);
    }

    /**
     * Creates "incorrect object" requests for the inspectors.
     *
     * 1. Links that were seen on the site after they had been deleted, and whose
     *    object is marked deleted, get a request with sender id -1 and their
     *    DeleteDate is cleared.
     * 2. Links flagged NeedInspector = 2 (seen within the last 30 days, or with an
     *    unknown LinkDate) get a request with sender id -2. Older uncorrected -2
     *    requests mentioning youla are removed first.
     */
    public function addIncorrectObjects()
    {
        $actualObjects = mysql_query("Select ObjectId, Url from oris.GrabberObjectLinks t1 where t1.DeleteDate is not Null and t1.LinkDate>t1.DeleteDate and t1.ObjectId in (select t2.object_id FROM oris.oris_objects t2 where t2.deleted=1)");
        $objects = mysql_query("Select ObjectId, Url from oris.GrabberObjectLinks where NeedInspector=2 and Url like '%youla%' and (LinkDate>DATE_ADD(NOW(), INTERVAL -30 Day)or LinkDate is Null or LinkDate='0000-00-00 00:00:00')");

        mysql_query("Delete from oris.oris_incorrect_objects_request where reason like '%youla%' and sender_user_id='-2' and corrected=0");

        while ($actualObjects && ($actualObject = mysql_fetch_row($actualObjects))) {
            $objectIdSql = mysql_real_escape_string($actualObject[0]);
            $urlSql = mysql_real_escape_string($actualObject[1]);

            // The source assigned $countOdj2 but tested $countObj2, so the second
            // condition below was always true; both names now refer to the same row.
            $countObj2 = mysql_fetch_row(mysql_query("SELECT COUNT(*) FROM oris.GrabberObjectLinks WHERE Url = '{$urlSql}' and DeleteDate>LinkDate"));
            $countObj = mysql_fetch_row(mysql_query("SELECT COUNT(*) FROM oris.GrabberObjectLinks WHERE Url = '{$urlSql}' and ObjectId in (select t2.object_id FROM oris.oris_objects t2 where t2.deleted=0)"));

            if ($countObj[0] == 0 && $countObj2[0] == 0) {
                $reason = "На сайте объект актуален. Нужно прозвонить. ".date('d-m-Y H:i').". ".$actualObject[1];
                $reasonSql = mysql_real_escape_string($reason);

                mysql_query("INSERT INTO `oris`.`oris_incorrect_objects_request` ( `object_id`, `sender_user_id`, `sender_name`, `reason`) VALUES ( '{$objectIdSql}', '-1', 'Граббер. Инспектор', '{$reasonSql}')") or die(mysql_error());
                mysql_query("Update oris.GrabberObjectLinks set DeleteDate = Null where ObjectId = '{$objectIdSql}'") or die(mysql_error());
            }
        }

        while ($objects && ($object = mysql_fetch_row($objects))) {
            $reason = "На сайте сняли с продажи. Нужно прозвонить. ".date('d-m-Y H:i').". ".$object[1];
            $objectIdSql = mysql_real_escape_string($object[0]);
            $reasonSql = mysql_real_escape_string($reason);

            mysql_query("INSERT INTO `oris`.`oris_incorrect_objects_request` ( `object_id`, `sender_user_id`, `sender_name`, `reason`) VALUES ( '{$objectIdSql}', '-2', 'Граббер. Инспектор', '{$reasonSql}')") or die(mysql_error());
        }
    }

    /**
     * Maps a site-specific id to a numeric key.
     *
     * Purely numeric values are returned unchanged. Other values are looked up in
     * objects_id_map for this site; when no mapping exists a new one is inserted
     * with key MAX(key) + 1 (or 1 for the first entry).
     *
     * @param string $value Id as used by the site.
     *
     * @return mixed Numeric key for the value.
     */
    protected function getMappedId($value)
    {
        if (preg_match('/^\d+$/', $value)) {
            return $value;
        }

        print_r("=================================\n");

        $value = mysql_real_escape_string($value);

        list($key) = mysql_fetch_array(mysql_query("SELECT `key` FROM objects_id_map WHERE site_id = '{$this->site_id}' AND `value` = '{$value}'"));
        if ($key) {
            return $key;
        }

        // Experiment (per the original comment): allocate the next key per site from
        // objects_id_map rather than from MAX(id) of the objects table.
        list($key) = mysql_fetch_array(mysql_query("SELECT MAX(`key`) + 1 FROM objects_id_map WHERE site_id = '{$this->site_id}'"));
        var_dump($key);
        if (!$key) {
            $key = 1;
        }

        mysql_query("INSERT INTO objects_id_map(`site_id`, `key`, `value`) VALUES('{$this->site_id}', '{$key}', '{$value}')") or $this->error(mysql_error());

        return $key;
    }
}