1,
'Второй' => 2,
'Третий' => 3,
'Четвертый' => 4,
'Пятый' => 5,
'Шестой' => 6,
'Седьмой' => 7,
'Восьмой' => 8,
'Девятый' => 9,
'Десятый' => 10,
'Одиннадцатый' => 11,
'Двенадцатый' => 12,
'Тринадцатый' => 13,
'Четырнадцатый' => 14,
'Пятнадцатый' => 15,
'Шестнадцатый' => 16,
'Семнадцатый' => 17,
'Восемнадцатый' => 18,
'Девятнадцатый' => 19,
'Двадцатый' => 20
);
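// Placeholder implementations (presumably of abstract methods required by the parent class); grab() below does not call them.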
/**
 * Empty placeholder: does nothing and returns null.
 *
 * @param mixed $urlsfx Ignored.
 * @param int   $p      Ignored.
 */
public function create_url($urlsfx, $p = 1)
{
// TODO: implement create_url(); grab() currently builds its URLs inline instead.
}
/**
 * Empty placeholder: does nothing and returns null.
 *
 * @param mixed $url Ignored.
 * @param int   $n   Ignored.
 */
public function getPages($url, $n = 0)
{
// TODO: implement getPages(); grab() currently pages through results itself.
}
// Timeout setting; not read in the visible code — presumably seconds, consumed by the parent class (TODO confirm).
public $timeout = 30;
// Enabled by default; not read in the visible code — presumably makes the parent class load previously stored items (TODO confirm).
public $check_olditems = TRUE;
/**
 * Forwards the site id to the parent constructor; adds no behaviour of its own.
 *
 * @param mixed $site_id Site identifier, passed through unchanged.
 */
public function __construct($site_id)
{
parent::__construct($site_id);
}
// URL code of the city currently being crawled; grab() assigns it before building each city's base URL.
public $cityinurl = "";
/**
 * Crawls every configured city and category on youla.ru and stores the listings found.
 *
 * For each city/category pair the result pages are requested one after another
 * through getItems() until a page comes back empty or $this->limit listings in
 * a row are rejected by checkItem(). The accepted listings are then passed, in
 * reverse order of collection, to getItem(); when that returns a falsy value
 * the listing is handed to saveUrl() instead. Once all cities are processed the
 * pid file of this grabber is removed.
 *
 * @return void
 */
public function grab()
{
    // Category slug used in the site URL => deal type stored on each item.
    $categories = array(
        'prodaja-kvartiri' => 'prodam',
        'prodaja-komnati' => 'prodam',
        'arenda-kvartiri' => 'sdam',
        'arenda-komnati' => 'sdam',
        'prodaja-doma' => 'prodam',
        'prodaja-uchastka' => 'prodam',
        'prochie-stroeniya' => 'prodam',
        'arenda-doma' => 'sdam',
    );
    // TODO: this is a workaround to make the "updated" handling work. The proper
    // solution is to override create_url() and get_pages() and adapt this class
    // so that the standard grab() method can be used.
    // Category slug used in the site URL => category name assigned to $this->category.
    $costulDlyaCategories = array(
        'prodaja-kvartiri' => 'kvartiry',
        'prodaja-komnati' => 'komnaty',
        'arenda-kvartiri' => 'kvartiry',
        'arenda-komnati' => 'komnaty',
        'prodaja-doma' => 'doma_dachi_kottedzhi',
        'prodaja-uchastka' => 'zemelnye_uchastki',
        'prochie-stroeniya' => 'kommercheskaya_nedvizhimost',
        'arenda-doma' => 'doma_dachi_kottedzhi',
    );
    // City URL code => display name. The keys are quoted string literals: written
    // as bare words they are constant lookups, which only worked through PHP's
    // undefined-constant fallback (a notice on PHP 5/7, a fatal error on PHP 8).
    // Only the codes are used below; the names are kept for reference.
    $arr = [
        'tomsk' => "Томск",
        'tomskaya_oblast_svetlyy' => "Светлый",
        'bogashevo' => "Богашёво",
        'zonalnaya_stantsiya' => "Зональная станция",
        'loskutovo' => "Лоскутово",
        'moryakovskiy_zaton' => "Моряковский Затон",
        'samus' => "Самусь",
        'seversk' => "Северск",
    ];
    // Fixed query-string tail: the attribute filter and sort order requested from the site.
    $query = '?attributes%5Bsobstvennik_ili_agent%5D%5B0%5D=10705&attributes%5Bsort_field%5D=date_published&page=';

    foreach (array_keys($arr) as $code) {
        $this->cityinurl = $code;
        $url = 'https://youla.ru/' . $this->cityinurl . '/nedvijimost/';

        foreach ($categories as $category => $dealType) {
            $page = 0;
            $this->items = [];
            $limit = $this->limit;
            $this->city = $this->cityinurl;
            $this->deal_type = $dealType;
            $this->category = $costulDlyaCategories[$category];

            while (TRUE) {
                $page++;
                $items = $this->getItems($url . $category . $query . $page);
                if (empty($items)) {
                    break;
                }
                foreach ($items as $item) {
                    if ($this->checkItem($item)) {
                        // An accepted listing resets the rejected-in-a-row counter.
                        $limit = $this->limit;
                        $item->deal_type = $dealType;
                        $this->items[$item->id] = $item;
                    } else {
                        $limit--;
                    }
                    if (!$limit) {
                        // Too many rejected listings in a row: stop paging this category.
                        break 2;
                    }
                }
            }

            $this->__log("\n\n=========\n ITEMS FOUND: " . $category . ' - ' . count($this->items) . "\n=========\n\n");

            foreach (array_reverse($this->items) as $item) {
                if (!$this->getItem($item)) {
                    $this->saveUrl($item);
                }
            }
        }
    }

    // Remove the pid file written by cron(); checked explicitly instead of
    // silencing unlink() with "@" so that real failures stay visible.
    $pidfile = $_ENV['ROOT'] . "/../pids/{$this->site->code_alias}.pid";
    if (is_file($pidfile)) {
        unlink($pidfile);
    }
}
/**
 * Downloads one search-result page and extracts the listings it contains.
 *
 * When $n is non-zero the call is treated as a retry: the failure is reported
 * through update_proxy(1, -1, ...) and, once $n exceeds $this->fails, the error
 * is logged and an empty array is returned. If the product-list block cannot be
 * cut out of the page, the method calls itself with $n + 1 unless the page
 * contains "alert_message__title", in which case it returns an empty array.
 *
 * NOTE(review): several regex literals in this method look truncated (the HTML
 * tag fragments appear to be missing) and the "$rgxp" assignment under
 * "Get data" is not valid PHP as it stands — restore the patterns from version
 * control before relying on this method.
 *
 * @param string $url Address of the result page to download.
 * @param int    $n   Retry counter; 0 on the first attempt.
 * @return array Objects with id, date, title, url and price, keyed by the first regex capture group.
 */
public function getItems($url, $n = 0)
{
$items = array();
// A non-zero $n means the previous attempt failed: report it and give up once $n exceeds $this->fails.
if($n){
$err = "getItems({$url}) - error receiving data";
$this->update_proxy(1, -1, $err);
if($n > $this->fails){
$this->__log($err);
return [];
}
}
$data = $this->get_url($url, FALSE);
$tmp = $data;
// Cut the page down to the product list: everything after the list's opening tag, up to the next closing </ul>.
// NOTE(review): the first pattern below starts with a line break and "]+", which suggests its opening tag text was lost.
$tmp = preg_split('/
]+class="product_list[^>]+">/ismU',$tmp);
$tmp = preg_split('/<\/ul>/ismU',$tmp[1]);
if (!$tmp[0]) {
if(!preg_match("/alert_message__title/ismU", $data)){
return $this->getItems($url, $n + 1);
}else{
return $items;
}}
// if (!preg_match('/]+class="product_list[^\"]+_board_items">(.+)<\/ul>/ismU', $data, $blockMatch)) { // if(!preg_match("/alert_message__title/ismU", $data)){ // return $this->getItems($url, $n + 1); // }else{ // return $items; // } // } // $this->__log($tmp[0]);
//Get items
$rgxp = '- ]+class="product_item"[^>]+>((?!<\/li>).)*<\/li>';
if (!preg_match_all('{' . $rgxp . '}ism', $tmp[0], $matches)) {
return $items;
}
//Get data
$rgxp = '
- ]*>\s*cityinurl . '[^"]+)"\s+title="([^"]+)"\s*>\s*'
. '.+>([^><]+).+
\s*'
. '
([^<]+)<\/span>([^<]+)<';
foreach($matches[0] as $item_parsed){
if(preg_match('{' . $rgxp . '}ism', $item_parsed, $tmp)){
$items[$tmp[1]] = (object) array(
'id' => $this->getMappedId($tmp[1]),
'date' => $this->get_date($tmp[6], $tmp[5]),
'title' => trim($tmp[3]),
'url' => 'https://youla.ru' . $tmp[2],
'price' => preg_replace('/\D+/', '', trim(html_entity_decode(strip_tags($tmp[4])))),
);
}
}
// // if (!preg_match_all('{' . $rgxp . '}ism', $blockMatch[1], $matches)) {
// if (!preg_match_all('{' . $rgxp . '}ism', $tmp[0], $matches)) { // return $items; // }
// foreach (array_keys($matches[0]) as $key) {
// $items[$matches[1][$key]] = (object) array( // 'id' => $this->getMappedId($matches[1][$key]), // 'date' => $this->get_date($matches[6][$key], $matches[5][$key]), // 'title' => $matches[3][$key], // 'url' => 'https://youla.ru' . $matches[2][$key], // 'price' => preg_replace('/\D+/', '', trim(html_entity_decode(strip_tags($matches[4][$key])))),
// );
// }
return $items;
}
/**
 * Converts a "d.m.Y" date plus an optional time fragment into "Y-m-d H:i".
 *
 * The first "HH:MM" found in $time is used; when there is none the time
 * defaults to 00:00. Surrounding whitespace in $date is ignored.
 *
 * If the date cannot be parsed, the current date and time are returned.
 * Previously an unparseable date caused a fatal "call to a member function
 * format() on a non-object" error, which aborted the whole crawl.
 *
 * @param string $date Date in "d.m.Y" form, e.g. "01.02.2020".
 * @param string $time Text that may contain an "HH:MM" time.
 * @return string Date and time formatted as "Y-m-d H:i".
 */
public function get_date($date, $time = "")
{
    $time = preg_match('/(\d\d\:\d\d)/', $time, $match)
        ? $match[1]
        : '00:00';

    $dateObj = date_create_from_format('d.m.Y H:i', trim($date) . ' ' . $time);
    if ($dateObj === false) {
        // Unparseable input (for example a relative date): fall back to "now"
        // rather than crash.
        return date('Y-m-d H:i');
    }

    return $dateObj->format('Y-m-d H:i');
}
/**
 * Downloads a single listing page, reads the state JSON embedded in it and stores the listing.
 *
 * The listing data is taken from the first entry of entities.products in the
 * "window.__YOULA_STATE__" JSON. The populated item is passed to updateItem()
 * when its id is present in $this->olditems, otherwise to saveItem().
 *
 * Differences from the earlier implementation:
 *  - values interpolated into SQL are escaped with mysql_real_escape_string();
 *  - the serialized string length written to objects_links is computed from
 *    the actual phone number instead of being hard-coded to 11;
 *  - a page without a usable product node is logged and skipped instead of
 *    failing later with an uncaught exception;
 *  - a missing or invalid publication date leaves $item->date untouched.
 *
 * @param object $item Listing stub; its id and url properties are read here and it is filled in place.
 * @param int    $n    Retry counter; 0 on the first attempt.
 * @return mixed The populated item on success, otherwise whatever __log() returns.
 */
public function getItem($item, $n = 0)
{
    $item->is_agency = 0;

    // A non-zero $n means the previous attempt failed: report it and give up
    // once $n exceeds $this->fails.
    if ($n) {
        $err = "getItem({$item->id}) - error receiving data";
        $this->update_proxy(1, -1, $err);
        if ($n > $this->fails) {
            return $this->__log($err);
        }
    }

    $data = $this->get_url($item->url, FALSE);

    if (!preg_match('{window.__YOULA_STATE__ =([^\n]+)}', $data, $tmp)) {
        return $this->__log("getItem({$item->id}) - error parsing JSON");
    }

    $json = json_decode(trim(trim($tmp[1]), ';'), TRUE);
    if (!isset($json['entities']['products'][0]) || !is_array($json['entities']['products'][0])) {
        // Invalid JSON or no product node: nothing to extract from this page.
        return $this->__log("getItem({$item->id}) - error parsing JSON");
    }
    $jsonProduct = $json['entities']['products'][0];

    $item->phone = isset($jsonProduct['owner']['displayPhoneNum'])
        ? $jsonProduct['owner']['displayPhoneNum']
        : '';

    if (empty($item->phone)) {
        $item->phone = "Нет телефона";
    } else {
        // A phone number is available now: back-fill rows stored earlier
        // with the "no phone" placeholder.
        $tmpphone = preg_replace("/^7/sm", "8", $item->phone);
        $sqlPhone = mysql_real_escape_string($tmpphone);
        $sqlUrl = mysql_real_escape_string($item->url);
        // objects_links.phone holds a serialized string, so the byte-length
        // prefix has to match the new value (23 is the byte length of the
        // placeholder in UTF-8).
        $phoneLength = strlen($tmpphone);

        mysql_query("UPDATE grabber.objects SET phone = '{$sqlPhone}' WHERE url = '{$sqlUrl}' AND phone = 'Нет телефона'");
        mysql_query("UPDATE grabber.objects_links SET phone = REPLACE(phone,'23:\"Нет телефона','{$phoneLength}:\"{$sqlPhone}') WHERE url = '{$sqlUrl}' AND phone LIKE '%Нет телефона%'");
    }

    // The category is derived from the listing URL.
    $this->_getObjectType($item->url, $item);
    $this->_getAgency($data, $item);

    $item->seller = $jsonProduct['owner']['name'];
    $item->title = $jsonProduct['name'];
    $item->address = $jsonProduct['location']['description'];
    $item->description = $jsonProduct['description'];
    $item->display = 1;
    $item->distance = isset($jsonProduct['distance']) ? $jsonProduct['distance'] : '';

    $this->_getParams(
        isset($jsonProduct['attributes']) ? $jsonProduct['attributes'] : array(),
        $item
    );
    $this->_getImages(
        isset($jsonProduct['images']) ? $jsonProduct['images'] : array(),
        $item
    );

    // Publication time, expressed in the timezone supplied with the listing.
    if (isset($jsonProduct['datePublished']['timestamp'], $jsonProduct['datePublished']['timezone']['name'])) {
        try {
            $dt = new DateTime();
            $dt->setTimezone(new DateTimeZone($jsonProduct['datePublished']['timezone']['name']));
            $dt->setTimestamp((int) $jsonProduct['datePublished']['timestamp']);
            // Formatting directly gives the same wall-clock string as the
            // former format/parse/format round trip, without re-interpreting
            // it in the server's default timezone.
            $item->date = $dt->format('Y-m-d H:i:s');
        } catch (Exception $e) {
            // Unknown timezone name: keep whatever date the item already has.
            $this->__log("getItem({$item->id}) - invalid publication date: " . $e->getMessage());
        }
    }

    if (isset($this->olditems[$item->id])) {
        $this->updateItem($item);
    } else {
        $this->saveItem($item);
    }

    return $item;
}
/**
 * Sets $item->category from the category slug found in the given text.
 *
 * The rules are tried top to bottom and the first matching one wins; when
 * none matches, $item->category is left untouched.
 *
 * @param string $data Text to inspect (getItem() passes the listing URL).
 * @param object $item Listing whose category property is set.
 * @return void
 */
private function _getObjectType($data, $item)
{
    // Pattern => category, in priority order.
    $rules = array(
        '/(arenda|prodaja)\-doma/i' => 'doma_dachi_kottedzhi',
        '/prochie\-stroeniya/i' => 'kommercheskaya_nedvizhimost',
        '/(arenda|prodaja)\-komnati/i' => 'komnaty',
        '/(arenda|prodaja)\-kvartiri/i' => 'kvartiry',
        '/prodaja\-uchastka/i' => 'zemelnye_uchastki',
    );

    foreach ($rules as $pattern => $category) {
        if (preg_match($pattern, $data)) {
            $item->category = $category;
            return;
        }
    }
}
/**
 * Copies listing attributes onto the item.
 *
 * Known slugs fill dedicated properties (level, levels, material, is_agency,
 * rooms, s); every other attribute is collected as "Label: value" and joined
 * with "; " into $item->params. $item->raion and $item->land are always
 * reset to '' and 0.0.
 *
 * Input that is not an array, and entries that are not arrays or have no
 * "slug", are skipped instead of raising warnings. Area values that are not
 * numeric are stored as 0.
 *
 * @param mixed  $data List of attributes, each an array with "slug" and "rawValue".
 * @param object $item Listing to populate.
 * @return void
 */
private function _getParams($data, $item)
{
    // Slug => human-readable label used in the free-form params string.
    $paramNames = array(
        'realty_etaj' => 'Этаж',
        'balkon' => 'Балкон',
        'realty_infrastructure' => 'Инфраструктура',
        'lift' => 'Лифт',
        'realty_etajnost_doma' => 'Этажность',
        'tip_doma' => 'Тип дома',
        'sobstvennik_ili_agent' => 'Собственник или агент',
        'komnat_v_kvartire' => 'Количество комнат',
        'remont' => 'Ремонт',
        'sanuzli' => 'Санузел',
        'let_v_sobstvennosti' => 'Лет в собственности',
        'realty_obshaya_ploshad' => 'Общая площадь',
        'realty_ploshad_kuhni' => 'Площадь кухни',
    );

    $item->raion = '';
    $item->land = 0.0;
    $params = array();

    if (!is_array($data)) {
        $data = array();
    }

    foreach ($data as $attribute) {
        if (!is_array($attribute) || !isset($attribute['slug'])) {
            continue;
        }
        $slug = $attribute['slug'];
        $raw = isset($attribute['rawValue']) ? $attribute['rawValue'] : null;

        switch ($slug) {
            case 'realty_etaj':
                $item->level = $this->_getLevel($raw);
                break;
            case 'realty_etajnost_doma':
                $item->levels = is_numeric($raw) ? $raw : 0;
                break;
            case 'tip_doma':
                $item->material = empty($raw) ? '' : $raw;
                break;
            case 'sobstvennik_ili_agent':
                if ($raw != 'Собственник') {
                    $item->is_agency = 1;
                }
                break;
            case 'komnat_v_kvartire':
                $item->rooms = $this->_getRooms($raw);
                break;
            case 'realty_obshaya_ploshad':
                // The raw area is divided by 100 before being stored.
                $item->s = is_numeric($raw) ? $raw / 100 : 0;
                break;
            case 'realty_ploshad_kuhni':
                $params[] = $paramNames['realty_ploshad_kuhni'] . ': ' . (is_numeric($raw) ? $raw / 100 : 0);
                break;
            default:
                // Unknown slugs are kept under their own name.
                $label = isset($paramNames[$slug]) ? $paramNames[$slug] : $slug;
                $params[] = $label . ': ' . $raw;
                break;
        }
    }

    $item->params = implode('; ', $params);
}
/**
 * Resolves a floor value to a number.
 *
 * Numeric input is returned unchanged. Anything else is looked up in
 * $this->level and yields 0 when it is not found there.
 *
 * @param mixed $value Raw floor value.
 * @return mixed The numeric input as given, the mapped number, or 0.
 */
private function _getLevel($value)
{
    if (is_numeric($value)) {
        return $value;
    }

    return isset($this->level[$value]) ? $this->level[$value] : 0;
}
/**
 * Extracts the number of rooms from a free-form value.
 *
 * All non-digit characters are removed; what remains is returned as a
 * string, or 0 when no digits are left.
 *
 * @param mixed $value Raw room-count value.
 * @return mixed Digit string, or integer 0 when the value holds no digits.
 */
private function _getRooms($value)
{
    $digits = preg_replace('/\D+/', '', $value);

    if (!is_numeric($digits)) {
        return 0;
    }

    return $digits;
}
/**
 * Marks the item as an agency listing when the text captured from the page is not "Собственник".
 *
 * NOTE(review): the pattern below looks truncated — the markup that should
 * surround the capture group appears to be missing. As written, the ungreedy
 * "(.+)" captures only the first character of $data, which can never equal
 * "Собственник", so every non-empty page sets is_agency to 1. Restore the
 * original pattern from version control.
 *
 * @param string $data Listing page HTML.
 * @param object $item Listing whose is_agency property may be set to 1.
 */
private function _getAgency($data, $item)
{
if (preg_match('{(.+)}ismU', $data, $matches) &&
(trim($matches[1]) != 'Собственник')
) {
$item->is_agency = 1;
}
}
/**
 * Stores the listing's image URLs on the item as one newline-separated string.
 *
 * @param array  $data List of images, each an array with a "url" entry.
 * @param object $item Listing whose images property is set.
 * @return void
 */
private function _getImages($data, $item)
{
    $urls = array();

    foreach ($data as $image) {
        $urls[] = $image['url'];
    }

    $item->images = implode("\n", $urls);
}
/**
 * Logs the message and terminates the script.
 *
 * The value returned by __log() is passed on as the exit status/message.
 *
 * @param string $err Error text to log.
 * @return void Never returns; the script ends here.
 */
protected function error($err)
{
    $logged = $this->__log($err);
    exit($logged);
}
/**
 * Restarts the background grabber process for this site.
 *
 * If a pid file exists, the process recorded in it is killed. A new
 * "php grabber.php <alias>" process is then started in the background with
 * its output redirected to the site's log file, and its pid is written to the
 * pid file.
 *
 * Differences from the earlier implementation:
 *  - $_ENV['ROOT'] is read with a quoted key; the unquoted form was a constant
 *    lookup that only worked through PHP's undefined-constant fallback;
 *  - the stored pid is validated as a positive integer before being passed to
 *    kill (a trailing newline used to split the shell command, and 0 would
 *    address the whole process group);
 *  - all shell arguments are escaped and the new pid is stored without the
 *    trailing newline.
 *
 * @return void
 */
public function cron()
{
    $root = $_ENV['ROOT'];
    $alias = $this->site->code_alias;
    $logfile = "{$root}/../logs/{$alias}.log";
    $pidfile = "{$root}/../pids/{$alias}.pid";

    if (is_file($pidfile)) {
        $pid = (int) trim((string) file_get_contents($pidfile));
        if ($pid > 0) {
            shell_exec("kill -9 {$pid} > /dev/null 2>&1");
        }
    }

    // "echo $!" prints the pid of the job just sent to the background;
    // the single quotes keep "$!" literal for the shell.
    $cmd = 'php ' . escapeshellarg("{$root}/grabber.php")
        . ' ' . escapeshellarg($alias)
        . ' > ' . escapeshellarg($logfile)
        . ' & echo $!';

    $pid = trim((string) shell_exec($cmd));
    file_put_contents($pidfile, $pid);
}
/**
 * Derives a key from an id and a token string.
 *
 * The token is split on every run of characters outside 0-9 and a-f. The
 * pieces are joined in their original order for an odd id and in reverse
 * order for an even id, and every third character of the result (positions
 * 0, 3, 6, ...) is kept.
 *
 * @param int    $id Identifier whose parity selects the piece order.
 * @param string $t  Token to derive the key from.
 * @return string The derived key.
 */
public function get_pkey($id, $t)
{
    $chunks = preg_split("/[^0-9a-f]+/", $t);
    if (!($id % 2)) {
        $chunks = array_reverse($chunks);
    }

    $joined = implode("", $chunks);
    $length = strlen($joined);
    $key = '';

    for ($pos = 0; $pos < $length; $pos += 3) {
        $key .= $joined[$pos];
    }

    return $key;
}
/**
 * Checks whether the HTML contains an "avito.item.phone = '...'" assignment.
 *
 * NOTE(review): the pattern refers to Avito, while the rest of this class
 * targets youla.ru — confirm this check is still meaningful here.
 *
 * @param string $html Page HTML to inspect.
 * @param mixed  $item Unused.
 * @return int|false Result of preg_match(): 1 on a match, 0 otherwise, false on error.
 */
public function isValidItemHtml($html, $item)
{
    $phonePattern = "/\bavito\.item\.phone\s*=\s*['\"]([^'\"]+)['\"]/ismU";

    return preg_match($phonePattern, $html);
}
/**
 * Picks a random proxy from get_proxylist() and makes it the current one.
 *
 * Each list entry is split on ":" into ip, port, login and password, which
 * are stored on $this->__proxy; $this->proxy is set to "ip:port".
 *
 * Differences from the earlier implementation:
 *  - the keys of the $this->__proxy template are quoted strings; as bare
 *    words they were constant lookups, which are a notice on PHP 5/7 and a
 *    fatal error on PHP 8;
 *  - an empty proxy list no longer calls mt_rand() with an invalid range, and
 *    entries with fewer than four parts no longer raise notices. The resulting
 *    values are the same as before (an empty list still yields ":").
 *
 * @param string $proxy Ignored; the value is replaced by the proxy list.
 * @param int    $reset Ignored.
 * @return string The selected proxy as "ip:port".
 */
public function get_proxy($proxy = "", $reset = 0)
{
    $proxy = $this->get_proxylist();
    $this->__proxy = (object) ['ip' => "", 'port' => "", 'login' => "", 'pass' => ""];

    $entry = empty($proxy)
        ? ''
        : $proxy[mt_rand(0, count($proxy) - 1)];

    // Missing parts become null, matching what list() assigned before.
    $parts = array_pad(explode(":", $entry), 4, null);
    $this->__proxy->ip = $parts[0];
    $this->__proxy->port = $parts[1];
    $this->__proxy->login = $parts[2];
    $this->__proxy->pass = $parts[3];

    $this->proxy = $this->__proxy->ip . ":" . $this->__proxy->port;

    return $this->proxy;
}
/**
 * Translates an external listing id into a numeric key.
 *
 * Ids consisting only of digits are returned unchanged. Any other id is
 * looked up in objects_id_map for the current site; when it is not there yet,
 * the next free key (MAX(key) + 1, or 1 for an empty map) is inserted and
 * returned. A failed insert is passed to error().
 *
 * @param string $value External id taken from the site.
 * @return mixed The id itself when numeric, otherwise the mapped key.
 */
protected function getMappedId($value)
{
    if (preg_match('/^\d+$/', $value)) {
        return $value;
    }

    $value = mysql_real_escape_string($value);

    // Reuse the key if this value has been mapped before.
    $existing = mysql_query("SELECT `key` FROM objects_id_map WHERE site_id = '{$this->site_id}' AND `value` = '{$value}'");
    list($mapped) = mysql_fetch_array($existing);
    if ($mapped) {
        return $mapped;
    }

    // Otherwise allocate the next key for this site.
    $next = mysql_query("SELECT MAX(`key`) + 1 FROM objects_id_map WHERE site_id = '{$this->site_id}'");
    list($mapped) = mysql_fetch_array($next);
    if (!$mapped) {
        $mapped = 1;
    }

    $inserted = mysql_query("INSERT INTO objects_id_map(`site_id`, `key`, `value`) VALUES('{$this->site_id}', '{$mapped}', '{$value}')");
    if (!$inserted) {
        $this->error(mysql_error());
    }

    return $mapped;
}
}