# update_status_lianjia.py
# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup as bs
import datetime
import time
import sys
import os
import pymongo
import xlsxwriter
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from urllib3.exceptions import ProtocolError
from lxml import etree
# Connection to the MongoDB server and a handle on the
# ``lianjia_2sf_info_new`` collection inside the ``fang`` database.
# NOTE: despite its name, ``db`` is a collection, not a database.
client = pymongo.MongoClient(host="192.168.3.88", port=27017)
db = client["fang"].get_collection("lianjia_2sf_info_new")

# Stamps taken once at start-up: the month names the error-URL log file,
# the day is written into each record's "更新时间" field.
now_month = format(datetime.datetime.now(), "%Y-%m")
now = format(datetime.datetime.now(), "%Y-%m-%d")
def _record_failed_url(inurl):
    """Append *inurl* to this month's ``<YYYY-MM>_update_error_urls.txt`` file."""
    # Written directly rather than through ``os.system("echo ...")`` so that
    # shell metacharacters in a URL (``&``, ``;``, ``?``) cannot change or
    # break the command being executed.
    with open("%s_update_error_urls.txt" % now_month, "a", encoding="utf-8") as log_file:
        log_file.write("%s\n" % inurl)


def _fetch_page_source(inurl):
    """Load *inurl* in headless Chrome and return its HTML, or None on failure.

    A URL whose load fails with a timeout, WebDriver error or connection
    error is recorded via ``_record_failed_url`` and None is returned.
    The browser is always shut down before returning.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # The user agent must be interpolated; the previous literal
    # 'user-agent={user_agent}' sent the placeholder text itself.
    chrome_options.add_argument('user-agent=%s' % user_agent)
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        try:
            driver.get(inurl)
        except (TimeoutException, WebDriverException,
                ConnectionResetError, ProtocolError):
            _record_failed_url(inurl)
            return None
        return driver.page_source
    finally:
        # Always release the browser process, even on an unexpected error.
        driver.quit()


def gethtml(inurl, url_id):
    """Scrape one listing page and refresh its record in the collection.

    Args:
        inurl: URL of the listing page to load.
        url_id: Value matched against the record's ``id_`` field.

    Returns:
        None. When the page cannot be loaded the URL is appended to the
        monthly error file and the record is left untouched.

    If the page shows no status text in its title, the listing is treated
    as on sale and its total price and per-square-metre price are stored;
    otherwise only the displayed status is stored. "更新时间" is set to the
    module-level ``now`` in both cases.
    """
    page_source = _fetch_page_source(inurl)
    if not page_source:
        return

    tree = etree.HTML(page_source)
    if tree is None:
        # Guard in case the parser produced no root element.
        return

    # Status text next to the title; empty means the listing is still on sale.
    sale_status = ''.join(tree.xpath('/html/body/div[3]/div/div/div[1]/h1/span/text()'))
    # Total price: the first two text nodes of the price container.
    total_price = ''.join(tree.xpath('/html/body/div[5]/div[2]/div[3]/*//text()')[0:2])
    # Price per square metre.
    unit_price = ''.join(tree.xpath('/html/body/div[5]/div[2]/div[3]/div[1]/div[1]/*//text()'))

    if not sale_status:
        changes = {"状态": "在售", "总售价": total_price, "每平单价": unit_price, "更新时间": now}
    else:
        changes = {"状态": sale_status.strip(), "更新时间": now}
    # ``Collection.update`` is deprecated (removed in PyMongo 4); ``update_one``
    # is the single-document equivalent.
    db.update_one({"id_": url_id}, {"$set": changes})
# Re-check every listing that has never been refreshed ("更新时间" unset) or
# whose stored status is "在售" or "已下架". To resume an interrupted run,
# append ``.skip(<number of records already processed>)`` to the cursor.
esf_info = db.find({"$or": [{'更新时间': None}, {'状态': '在售'}, {'状态': '已下架'}]}).batch_size(10)
for count, info in enumerate(esf_info, 1):
    inurl = info['页面链接']
    print(count, inurl)
    # The record id is the last path segment of the URL without its extension.
    url_id = inurl.split('/')[-1].split('.')[0]
    gethtml(inurl, url_id)

# Once all pages are processed, flag every record whose total price is
# empty as sold. ``update_many`` replaces the deprecated
# ``update(..., multi=True)``.
db.update_many({"总售价": ""}, {"$set": {"状态": "已出售"}})