1, 'Второй' => 2, 'Третий' => 3, 'Четвертый' => 4, 'Пятый' => 5, 'Шестой' => 6, 'Седьмой' => 7, 'Восьмой' => 8, 'Девятый' => 9, 'Десятый' => 10, 'Одиннадцатый' => 11, 'Двенадцатый' => 12, 'Тринадцатый' => 13, 'Четырнадцатый' => 14, 'Пятнадцатый' => 15, 'Шестнадцатый' => 16, 'Семнадцатый' => 17, 'Восемнадцатый' => 18, 'Девятнадцатый' => 19, 'Двадцатый' => 20 );

    // abstract, trash

    /**
     * Empty stub (presumably present only to satisfy an abstract declaration
     * in the parent class - TODO confirm). grab() below does not call it.
     *
     * @param mixed $urlsfx Unused.
     * @param int   $p      Unused.
     */
    public function create_url($urlsfx, $p = 1) {
        // TODO: Implement create_url() method.
    }

    /**
     * Empty stub, see create_url().
     *
     * @param mixed $url Unused.
     * @param int   $n   Unused.
     */
    public function getPages($url, $n = 0) {
        // TODO: Implement getPages() method.
    }

    // Not read anywhere in this class body; presumably consumed by the parent class.
    public $timeout = 30;

    // Not read anywhere in this class body; presumably consumed by the parent class.
    public $check_olditems = TRUE;

    /**
     * Delegates straight to the parent constructor.
     *
     * @param mixed $site_id Site identifier, passed through unchanged.
     */
    public function __construct($site_id) {
        parent::__construct($site_id);
    }

    // City slug currently being crawled; set by grab() for every city.
    public $cityinurl = "";

    /**
     * Crawls every configured city and category: pages through the listing
     * until it is exhausted or too many consecutive items are rejected by
     * checkItem(), then loads every accepted item through getItem().
     * Finally removes this site's pid file.
     */
    public function grab() {
        // Site category slug => deal type stored on the item.
        $categories = array(
            'prodaja-kvartiri' => 'prodam',
            'prodaja-komnati' => 'prodam',
            'arenda-kvartiri' => 'sdam',
            'arenda-komnati' => 'sdam',
            'prodaja-doma' => 'prodam',
            'prodaja-uchastka' => 'prodam',
            'prochie-stroeniya' => 'prodam',
            'arenda-doma' => 'sdam',
        );

        // TODO: This is a crutch that keeps the "updated" handling working; the proper
        // solution is to override create_url and get_pages and adapt them so that the
        // standard grab method can be used.
        $costulDlyaCategories = array(
            'prodaja-kvartiri' => 'kvartiry',
            'prodaja-komnati' => 'komnaty',
            'arenda-kvartiri' => 'kvartiry',
            'arenda-komnati' => 'komnaty',
            'prodaja-doma' => 'doma_dachi_kottedzhi',
            'prodaja-uchastka' => 'zemelnye_uchastki',
            'prochie-stroeniya' => 'kommercheskaya_nedvizhimost',
            'arenda-doma' => 'doma_dachi_kottedzhi',
        );

        // City slug => display name. Keys are quoted: the bare words used previously
        // were undefined-constant lookups (a notice on PHP 5, an Error on PHP 8).
        $cities = [
            'tomsk' => "Томск",
            'tomskaya_oblast_svetlyy' => "Светлый",
            'bogashevo' => "Богашёво",
            'zonalnaya_stantsiya' => "Зональная станция",
            'loskutovo' => "Лоскутово",
            'moryakovskiy_zaton' => "Моряковский Затон",
            'samus' => "Самусь",
            'seversk' => "Северск",
        ];

        // Only the slugs are needed; the names document the list.
        foreach (array_keys($cities) as $code) {
            $this->cityinurl = $code;
            $url = 'https://youla.ru/' . $this->cityinurl . '/nedvijimost/';

            foreach ($categories as $category => $dealType) {
                $page = 0;
                $this->items = [];
                // Counts down on every rejected item and is reset on every accepted one.
                $limit = $this->limit;
                $this->city = $this->cityinurl;
                $this->deal_type = $dealType;
                $this->category = $costulDlyaCategories[$category];

                while (TRUE) {
                    $page++;
                    $items = $this->getItems($url . $category . '?attributes%5Bsobstvennik_ili_agent%5D%5B0%5D=10705&attributes%5Bsort_field%5D=date_published&page=' . $page);
                    if (empty($items)) {
                        break;
                    }

                    foreach ($items as $item) {
                        if ($this->checkItem($item)) {
                            $limit = $this->limit;
                            $item->deal_type = $dealType;
                            $this->items[$item->id] = $item;
                        } else {
                            $limit--;
                        }

                        if (!$limit) {
                            // Leaves both the item loop and the paging loop.
                            break 2;
                        }
                    }
                }

                $this->__log("\n\n=========\n ITEMS FOUND: " . $category . ' - ' . count($this->items) . "\n=========\n\n");

                // Accepted items are processed in reverse collection order.
                foreach (array_reverse($this->items) as $item) {
                    if (!$this->getItem($item)) {
                        $this->saveUrl($item);
                    }
                }
            }
        }

        // NOTE(review): inside "{...}" the bare ROOT is a constant lookup, not the string
        // key 'ROOT'. Left untouched because a project-level ROOT constant cannot be ruled
        // out from this file - confirm and quote the key if none exists.
        @unlink("{$_ENV[ROOT]}/../pids/{$this->site->code_alias}.pid");
    }

    /**
     * Downloads one listing page and extracts the item stubs found on it.
     *
     * When the product list block is missing and the page carries no
     * "alert_message__title" marker, the request is retried recursively;
     * every retry reports the failure through update_proxy() and the chain
     * stops once $n exceeds $this->fails.
     *
     * @param string $url Listing page URL.
     * @param int    $n   Retry counter; 0 on the first attempt.
     * @return array Item stubs (objects with id, date, title, url, price) keyed by the
     *               raw id captured from the markup; empty when nothing was found.
     */
    public function getItems($url, $n = 0) {
        $items = array();

        if ($n) {
            $err = "getItems({$url}) - error receiving data";
            $this->update_proxy(1, -1, $err);
            if ($n > $this->fails) {
                $this->__log($err);
                return [];
            }
        }

        $data = $this->get_url($url, FALSE);

        // NOTE(review): this pattern and the two $rgxp literals below look damaged in the
        // file as received (each seems to have lost a leading "<tag[^>" fragment). They are
        // reproduced byte-for-byte; restore the real patterns from version control.
        $parts = preg_split('/]+class="product_list[^>]+">/ismU', $data);

        // Guarded: previously index 1 was read even when the split found no list block.
        $listHtml = '';
        if (isset($parts[1])) {
            $chunks = preg_split('/<\/ul>/ismU', $parts[1]);
            $listHtml = $chunks[0];
        }

        if (!$listHtml) {
            if (!preg_match("/alert_message__title/ismU", $data)) {
                return $this->getItems($url, $n + 1);
            }
            return $items;
        }

        //Get items
        $rgxp = ']+class="product_item"[^>]+>((?!<\/li>).)*<\/li>';
        if (!preg_match_all('{' . $rgxp . '}ism', $listHtml, $matches)) {
            return $items;
        }

        //Get data
        // NOTE(review): the next statement is not valid PHP as received (the string literals
        // are unbalanced, apparently through the same damage). It is kept verbatim because
        // the original expression cannot be reconstructed from this file.
        $rgxp = ']*>\s*cityinurl . '[^"]+)"\s+title="([^"]+)"\s*>\s*' . '.+>([^><]+).+\s*' .
            '([^<]+)<\/span>([^<]+)<';

        foreach ($matches[0] as $item_parsed) {
            if (preg_match('{' . $rgxp . '}ism', $item_parsed, $m)) {
                $items[$m[1]] = (object) array(
                    'id' => $this->getMappedId($m[1]),
                    'date' => $this->get_date($m[6], $m[5]),
                    'title' => trim($m[3]),
                    'url' => 'https://youla.ru' . $m[2],
                    // Keep digits only.
                    'price' => preg_replace('/\D+/', '', trim(html_entity_decode(strip_tags($m[4])))),
                );
            }
        }

        return $items;
    }

    /**
     * Converts a "d.m.Y" date plus an optional time fragment to "Y-m-d H:i".
     *
     * @param string $date Date in d.m.Y form; surrounding whitespace is ignored.
     * @param string $time Text containing an HH:MM time; "00:00" is used when none is found.
     * @return string Formatted date. When $date cannot be parsed the failure is logged
     *                and the current time is returned instead of raising a fatal error.
     */
    public function get_date($date, $time = "") {
        $time = preg_match('/(\d\d\:\d\d)/', $time, $match) ? $match[1] : '00:00';
        $dateObj = date_create_from_format('d.m.Y H:i', trim($date) . ' ' . $time);

        // date_create_from_format() returns false on failure; calling format() on it is fatal.
        if ($dateObj === false) {
            $this->__log("get_date({$date}) - error parsing date");
            return date('Y-m-d H:i');
        }

        return $dateObj->format('Y-m-d H:i');
    }

    /**
     * Loads an item page, fills the item from the embedded window.__YOULA_STATE__
     * JSON and stores it via updateItem() or saveItem().
     *
     * @param object $item Item stub produced by getItems(); modified in place.
     * @param int    $n    Retry counter; a non-zero value reports a failure through
     *                     update_proxy() and gives up once it exceeds $this->fails.
     * @return mixed The filled item on success, otherwise the result of __log().
     */
    public function getItem($item, $n = 0) {
        $item->is_agency = 0;

        if ($n) {
            $err = "getItem({$item->id}) - error receiving data";
            $this->update_proxy(1, -1, $err);
            if ($n > $this->fails) {
                return $this->__log($err);
            }
        }

        $data = $this->get_url($item->url, FALSE);

        if (!preg_match('{window.__YOULA_STATE__ =([^\n]+)}', $data, $tmp)) {
            return $this->__log("getItem({$item->id}) - error parsing JSON");
        }

        $json = json_decode(trim(trim($tmp[1]), ';'), TRUE);

        // Invalid JSON or a missing product would otherwise surface later as foreach
        // warnings and a DateTimeZone exception.
        $jsonProduct = isset($json['entities']['products'][0]) ? $json['entities']['products'][0] : null;
        if (!is_array($jsonProduct)) {
            return $this->__log("getItem({$item->id}) - error parsing JSON");
        }

        $item->phone = isset($jsonProduct['owner']['displayPhoneNum']) ? $jsonProduct['owner']['displayPhoneNum'] : '';
        if (empty($item->phone)) {
            $item->phone = "Нет телефона";
        } else {
            // Back-fill rows that were stored earlier without a phone number.
            $tmpphone = preg_replace("/^7/sm", "8", $item->phone);

            // The phone comes from the remote page, so escape it (and the URL) before
            // building SQL.
            $sqlPhone = mysql_real_escape_string($tmpphone);
            $sqlUrl = mysql_real_escape_string($item->url);

            // The REPLACE rewrites a length-prefixed string (23 is the byte length of the
            // placeholder), so the new prefix must be the real byte length of the phone
            // rather than a hard-coded 11.
            $phoneLen = strlen($tmpphone);

            mysql_query("UPDATE grabber.objects SET phone = '{$sqlPhone}' WHERE url = '{$sqlUrl}' AND phone = 'Нет телефона'");
            mysql_query("UPDATE grabber.objects_links SET phone = REPLACE(phone,'23:\"Нет телефона','{$phoneLen}:\"{$sqlPhone}') WHERE url = '{$sqlUrl}' AND phone LIKE '%Нет телефона%'");
        }

        // Nope from URL -> see categories in DB
        $this->_getObjectType($item->url, $item);
        $this->_getAgency($data, $item);

        $item->seller = $jsonProduct['owner']['name'];
        $item->title = $jsonProduct['name'];
        $item->address = $jsonProduct['location']['description'];
        $item->description = $jsonProduct['description'];
        $item->display = 1;//$this->checkPhone($item->phone);
        $item->distance = isset($jsonProduct['distance']) ? $jsonProduct['distance'] : '';

        $this->_getParams($jsonProduct['attributes'], $item);
        $this->_getImages($jsonProduct['images'], $item);

        // Publication time expressed in the timezone supplied by the page.
        $dt = new DateTime();
        $dt->setTimezone(new DateTimeZone($jsonProduct['datePublished']['timezone']['name']));
        $dt->setTimestamp($jsonProduct['datePublished']['timestamp']);
        $item->date = $dt->format('Y-m-d H:i:s');

        if (isset($this->olditems[$item->id])) {
            $this->updateItem($item);
        } else {
            $this->saveItem($item);
        }

        return $item;
    }

    /**
     * Sets $item->category from the category slug found in $data (getItem()
     * passes the item URL). The first matching pattern wins; the item is left
     * untouched when none matches.
     *
     * @param string $data Text to inspect.
     * @param object $item Item to update.
     */
    private function _getObjectType($data, $item) {
        // Checked in order.
        $patterns = array(
            '/(arenda|prodaja)\-doma/i' => 'doma_dachi_kottedzhi',
            '/prochie\-stroeniya/i' => 'kommercheskaya_nedvizhimost',
            '/(arenda|prodaja)\-komnati/i' => 'komnaty',
            '/(arenda|prodaja)\-kvartiri/i' => 'kvartiry',
            '/prodaja\-uchastka/i' => 'zemelnye_uchastki',
        );

        foreach ($patterns as $pattern => $category) {
            if (preg_match($pattern, $data)) {
                $item->category = $category;
                return;
            }
        }
    }

    /**
     * Maps the product attribute list onto item fields; attributes without a
     * dedicated field are collected into the "; "-joined $item->params string.
     *
     * @param array  $data Attribute rows, each read through its 'slug' and 'rawValue' keys.
     * @param object $item Item to update.
     */
    private function _getParams($data, $item) {
        // Human-readable labels used in $item->params.
        $paramNames = array(
            'realty_etaj' => 'Этаж',
            'balkon' => 'Балкон',
            'realty_infrastructure' => 'Инфраструктура',
            'lift' => 'Лифт',
            'realty_etajnost_doma' => 'Этажность',
            'tip_doma' => 'Тип дома',
            'sobstvennik_ili_agent' => 'Собственник или агент',
            'komnat_v_kvartire' => 'Количество комнат',
            'remont' => 'Ремонт',
            'sanuzli' => 'Санузел',
            'let_v_sobstvennosti' => 'Лет в собственности',
            'realty_obshaya_ploshad' => 'Общая площадь',
            'realty_ploshad_kuhni' => 'Площадь кухни',
        );

        $item->raion = '';
        $item->land = 0.0;
        $params = array();

        // A product without attributes must not break the loop.
        if (!is_array($data)) {
            $data = array();
        }

        foreach ($data as $attribute) {
            $slug = $attribute['slug'];
            $raw = $attribute['rawValue'];

            switch ($slug) {
                case 'realty_etaj':
                    $item->level = $this->_getLevel($raw);
                    break;
                case 'realty_etajnost_doma':
                    $item->levels = is_numeric($raw) ? $raw : 0;
                    break;
                case 'tip_doma':
                    $item->material = empty($raw) ? '' : $raw;
                    break;
                case 'sobstvennik_ili_agent':
                    if ($raw != 'Собственник') {
                        $item->is_agency = 1;
                    }
                    break;
                case 'komnat_v_kvartire':
                    $item->rooms = $this->_getRooms($raw);
                    break;
                case 'realty_obshaya_ploshad':
                    // NOTE(review): raw areas look like hundredths of a unit - confirm.
                    $item->s = $raw / 100;
                    break;
                case 'realty_ploshad_kuhni':
                    $params[] = $paramNames['realty_ploshad_kuhni'] . ': ' . ($raw / 100);
                    break;
                default:
                    $label = isset($paramNames[$slug]) ? $paramNames[$slug] : $slug;
                    $params[] = $label . ': ' . $raw;
                    break;
            }
        }

        $item->params = implode('; ', $params);
    }

    /**
     * Resolves a floor value: numeric values are returned unchanged, other
     * values are looked up in $this->level, and 0 is returned when unknown.
     *
     * @param mixed $value Raw attribute value.
     * @return mixed
     */
    private function _getLevel($value) {
        if (is_numeric($value)) {
            return $value;
        }

        return isset($this->level[$value]) ? $this->level[$value] : 0;
    }

    /**
     * Extracts the digits from a room-count value.
     *
     * @param string $value Raw attribute value.
     * @return mixed The digit string, or 0 when the value contains no digits.
     */
    private function _getRooms($value) {
        $digits = preg_replace('/\D+/', '', $value);

        return is_numeric($digits) ? $digits : 0;
    }

    /**
     * Flags the item as an agency listing when the text captured from the page
     * is anything other than "Собственник".
     *
     * @param string $data Item page HTML.
     * @param object $item Item to update.
     */
    private function _getAgency($data, $item) {
        // NOTE(review): this pattern looks damaged in the file as received (the markup
        // around the capture group is missing), so as written it matches almost any input.
        // Reproduced byte-for-byte; restore the real pattern from version control.
        if (preg_match('{(.+)}ismU', $data, $matches) && (trim($matches[1]) != 'Собственник')) {
            $item->is_agency = 1;
        }
    }

    /**
     * Stores the image URLs on the item as a newline-separated string.
     *
     * @param array  $data Image rows, each read through its 'url' key.
     * @param object $item Item to update.
     */
    private function _getImages($data, $item) {
        $urls = array();

        // A product without images yields an empty string instead of a warning.
        if (is_array($data)) {
            foreach ($data as $image) {
                $urls[] = $image['url'];
            }
        }

        $item->images = implode("\n", $urls);
    }

    /**
     * Logs the message and terminates the script.
     *
     * @param string $err Error message.
     */
    protected function error($err) {
        die($this->__log($err));
    }

    /**
     * Kills the process recorded in this site's pid file (if any), starts
     * grabber.php for the site in the background and records the new pid.
     */
    public function cron() {
        // NOTE(review): bare ROOT inside "{...}" is a constant lookup; see the note in grab().
        $logfile = "{$_ENV[ROOT]}/../logs/{$this->site->code_alias}.log";
        $pidfile = "{$_ENV[ROOT]}/../pids/{$this->site->code_alias}.pid";

        if (is_file($pidfile)) {
            // The stored value is raw "echo $!" output, so it carries a trailing newline.
            // Normalising to a positive integer keeps the command on one line and rules
            // out "kill -9 0", which would signal the whole process group.
            $pid = (int) trim(file_get_contents($pidfile));
            if ($pid > 0) {
                shell_exec("kill -9 {$pid} > /dev/null 2>&1");
            }
        }

        $cmd = "php {$_ENV[ROOT]}/grabber.php {$this->site->code_alias} > {$logfile} & echo $!";
        $pid = shell_exec($cmd);
        file_put_contents($pidfile, $pid);
    }

    /**
     * Derives a key from a token: the token is split on every run of characters
     * outside 0-9a-f, the pieces are joined (in reverse order when $id is even)
     * and every third character, starting with the first, is kept.
     *
     * @param int    $id Identifier whose parity selects the piece order.
     * @param string $t  Source token.
     * @return string
     */
    public function get_pkey($id, $t) {
        $pieces = preg_split("/[^0-9a-f]+/", $t);
        $chars = str_split(implode("", $id % 2 ? $pieces : array_reverse($pieces)));

        $r = '';
        for ($i = 0, $len = count($chars); $i < $len; $i += 3) {
            $r .= $chars[$i];
        }

        return $r;
    }

    /**
     * Reports whether the HTML contains an "avito.item.phone = '...'" assignment.
     *
     * @param string $html Page HTML.
     * @param object $item Unused.
     * @return int|false Result of preg_match().
     */
    public function isValidItemHtml($html, $item) {
        return preg_match("/\bavito\.item\.phone\s*=\s*['\"]([^'\"]+)['\"]/ismU", $html);
    }

    /**
     * Picks a random entry from get_proxylist(), stores its parts in
     * $this->__proxy and sets $this->proxy to "ip:port".
     *
     * @param string $proxy Ignored; kept for signature compatibility.
     * @param int    $reset Ignored; kept for signature compatibility.
     * @return string The selected "ip:port" (":" when the proxy list is empty).
     */
    public function get_proxy($proxy = "", $reset = 0) {
        $proxy = $this->get_proxylist();

        // Keys are quoted: the bare words used previously were undefined-constant lookups.
        $this->__proxy = (object) ['ip' => "", 'port' => "", 'login' => "", 'pass' => ""];

        // Avoid mt_rand(0, -1) on an empty list.
        $this->proxy = empty($proxy) ? "" : $proxy[mt_rand(0, count($proxy) - 1)];

        // Entries are read as ip:port:login:pass; pad so that shorter entries do not
        // raise undefined-offset notices.
        $parts = array_pad(explode(":", $this->proxy), 4, "");
        $this->__proxy->ip = $parts[0];
        $this->__proxy->port = $parts[1];
        $this->__proxy->login = $parts[2];
        $this->__proxy->pass = $parts[3];

        $this->proxy = $this->__proxy->ip . ":" . $this->__proxy->port;

        return $this->proxy;
    }

    /**
     * Returns a numeric id for a site item id. Numeric input is returned as is;
     * anything else is looked up in objects_id_map for this site and, when no
     * mapping exists, inserted with the next free key (MAX(key) + 1, or 1).
     *
     * @param string $value Raw item id.
     * @return mixed
     */
    protected function getMappedId($value) {
        if (preg_match('/^\d+$/', $value)) {
            return $value;
        }

        $value = mysql_real_escape_string($value);

        list($key) = mysql_fetch_array(mysql_query("SELECT `key` FROM objects_id_map WHERE site_id = '{$this->site_id}' AND `value` = '{$value}'"));
        if ($key) {
            return $key;
        }

        list($key) = mysql_fetch_array(mysql_query("SELECT MAX(`key`) + 1 FROM objects_id_map WHERE site_id = '{$this->site_id}'"));
        if (!$key) {
            $key = 1;
        }

        mysql_query("INSERT INTO objects_id_map(`site_id`, `key`, `value`) VALUES('{$this->site_id}', '{$key}', '{$value}')") or $this->error(mysql_error());

        return $key;
    }
}