import random
import re
import socket
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup
from fastapi.responses import JSONResponse
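
# Optional addition (not in the original): since verify=False is used below,
# requests' underlying urllib3 would print an InsecureRequestWarning on every
# unverified HTTPS request; disable_warnings() silences that noise.
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)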

async def scrape_url_improved(request):
    """
    Improved version of the URL-scraping function; addresses the network
    connectivity problems seen after server deployment.
    """
    try:
        print(f"Starting to scrape URL: {request.url}")

        # Request headers that mimic a browser visit
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
        }

        # DNS resolution test before attempting the request
        try:
            url_parts = urllib.parse.urlparse(request.url)
            hostname = url_parts.hostname
            print(f"Resolving hostname: {hostname}")
            ip_address = socket.gethostbyname(hostname)
            print(f"Hostname resolved: {hostname} -> {ip_address}")
        except socket.gaierror as dns_error:
            print(f"DNS resolution failed: {dns_error}")
            return JSONResponse(
                status_code=500,
                content={"error": f"DNS resolution failed, cannot reach {hostname}: {str(dns_error)}"},
            )
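
        # Added observation (not in the original): gethostbyname() resolves
        # IPv4 only, and urlparse() yields hostname=None for malformed URLs,
        # which raises TypeError here rather than gaierror.
        # socket.getaddrinfo(hostname, None) would also cover IPv6-only hosts.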

        # Retry mechanism with generous connect/read timeouts
        max_retries = 3
        timeout_settings = (30, 90)  # (connect timeout, read timeout)

        response = None
        last_error = None

        for attempt in range(max_retries):
            try:
                print(f"Connection attempt {attempt + 1}...")
                start_time = time.time()

                # Fetch the page, with extended timeouts and redirects allowed
                response = requests.get(
                    request.url,
                    headers=headers,
                    timeout=timeout_settings,
                    verify=False,  # Temporarily disable SSL verification to avoid certificate issues
                    allow_redirects=True,
                )

                elapsed_time = time.time() - start_time
                print(f"Request succeeded in {elapsed_time:.2f}s")
                response.raise_for_status()  # Raise if the HTTP status indicates failure
                break

            except requests.exceptions.Timeout as timeout_error:
                last_error = timeout_error
                print(f"Attempt {attempt + 1} timed out: {timeout_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2  # Increasing backoff
                    print(f"Waiting {wait_time}s before retrying...")
                    time.sleep(wait_time)
                continue

            except requests.exceptions.ConnectionError as conn_error:
                last_error = conn_error
                print(f"Attempt {attempt + 1} connection error: {conn_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"Waiting {wait_time}s before retrying...")
                    time.sleep(wait_time)
                continue

            except requests.exceptions.RequestException as req_error:
                last_error = req_error
                # Discard the bad response (e.g. from raise_for_status) so the
                # failure check after the loop triggers instead of parsing an
                # error page
                response = None
                print(f"Attempt {attempt + 1} request error: {req_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"Waiting {wait_time}s before retrying...")
                    time.sleep(wait_time)
                continue
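
        # Note: the manual loop above could also be replaced with the retry
        # support built into requests/urllib3, e.g. (a sketch, not part of
        # the original code):
        #
        #   from requests.adapters import HTTPAdapter
        #   from urllib3.util.retry import Retry
        #
        #   session = requests.Session()
        #   session.mount("https://", HTTPAdapter(max_retries=Retry(total=3, backoff_factor=2)))
        #   response = session.get(request.url, headers=headers, timeout=timeout_settings)
        #
        # which retries at the transport level instead of in application code.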

        # All retries failed
        if response is None:
            error_msg = f"Still unable to connect to {request.url} after {max_retries} retries"
            if last_error:
                error_msg += f", last error: {str(last_error)}"
            print(error_msg)
            return JSONResponse(
                status_code=500,
                content={
                    "error": error_msg,
                    "suggestions": [
                        "Check the server's network connection",
                        "Confirm that the target site is reachable",
                        "Check firewall settings",
                        "Consider configuring a proxy server",
                        "Contact the system administrator to review the network configuration",
                    ],
                },
            )

        # Set the encoding so Chinese characters are handled correctly
        response.encoding = 'utf-8'

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')
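
        # Alternative (not in the original): hard-coding utf-8 can garble
        # pages served in GBK or other encodings; letting requests guess via
        # response.encoding = response.apparent_encoding is more robust.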

        # Base URL for resolving relative paths
        url_parts = urllib.parse.urlparse(request.url)
        base_url = f"{url_parts.scheme}://{url_parts.netloc}"

        # Initialize the data dict with a random ID, a placeholder SVG photo,
        # and six random evaluation scores clamped to the 60-80 range
        teacher_data = {
            "id": f"BLG{random.randint(10000, 99999)}",
            "photo": "data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='100' height='120' viewBox='0 0 100 120'%3E%3Crect width='100' height='120' fill='%234986ff' opacity='0.3'/%3E%3Ccircle cx='50' cy='45' r='25' fill='%234986ff' opacity='0.6'/%3E%3Ccircle cx='50' cy='95' r='35' fill='%234986ff' opacity='0.6'/%3E%3C/svg%3E",
            "evaluationData": [
                round(min(100, max(60, 70 + 20 * (0.5 - random.random())))) for _ in range(6)
            ],
        }

        # Extract basic information from the teacher info table
        info_table = soup.find('div', class_='wz_teacher')
        if info_table:
            table = info_table.find('table')
            if table:
                rows = table.find_all('tr')

                # Name, gender, date of birth
                if len(rows) > 0:
                    cells = rows[0].find_all('td')
                    if len(cells) >= 6:
                        teacher_data["name"] = cells[1].text.strip()
                        teacher_data["gender"] = cells[3].text.strip()
                        teacher_data["birthDate"] = cells[5].text.strip()

                # Title, position, highest degree
                if len(rows) > 1:
                    cells = rows[1].find_all('td')
                    if len(cells) >= 6:
                        teacher_data["title"] = cells[1].text.strip()
                        position = cells[3].text.strip()
                        teacher_data["position"] = position if position else ""
                        teacher_data["education"] = cells[5].text.strip()

                # Academic direction
                if len(rows) > 2:
                    cells = rows[2].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["academicDirection"] = cells[1].text.strip()

                # Talent program and office location
                if len(rows) > 3:
                    cells = rows[3].find_all('td')
                    if len(cells) >= 6:
                        talent_plan = cells[1].text.strip()
                        teacher_data["talentPlan"] = talent_plan if talent_plan else ""
                        teacher_data["officeLocation"] = cells[5].text.strip()

                # Email and phone number
                if len(rows) > 4:
                    cells = rows[4].find_all('td')
                    if len(cells) >= 6:
                        email = cells[1].text.strip()
                        teacher_data["email"] = email if email else ""
                        phone = cells[5].text.strip()
                        teacher_data["phone"] = phone if phone else ""

                # Mailing address
                if len(rows) > 5:
                    cells = rows[5].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["address"] = cells[1].text.strip()

                # Tutor type
                if len(rows) > 6:
                    cells = rows[6].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["tutorType"] = cells[1].text.strip()

        # Extract the photo
        photo_element = soup.select_one('.teacherInfo .img img')
        if photo_element and photo_element.get('src'):
            img_src = photo_element['src']

            # Resolve relative paths into a full image URL
            if img_src.startswith('../../../'):
                # Derive the base path from the URL (drop the file name and
                # the last two directory levels)
                url_parts = request.url.split('/')
                if len(url_parts) >= 4:
                    base_path = '/'.join(url_parts[:-3])
                    img_url = f"{base_path}/{img_src[9:]}"  # Strip the leading '../../../'
                else:
                    img_url = urllib.parse.urljoin(base_url, img_src)
            else:
                img_url = urllib.parse.urljoin(base_url, img_src)

            # Store the full image URL directly instead of downloading it
            teacher_data["photo"] = img_url
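
        # Observation (not in the original): urllib.parse.urljoin() resolves
        # '../' segments when joined against the full page URL, so
        # urljoin(request.url, img_src) would make the manual branch above
        # unnecessary.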

        # Extract the detailed information sections
        content_divs = soup.select('.con01_t')
        for div in content_divs:
            heading = div.find('h3')
            if not heading:
                continue

            heading_text = heading.text.strip()

            # Collect all non-empty paragraph text in this section
            paragraphs = [p.text.strip() for p in div.find_all('p') if p.text.strip()]
            section_content = '\n'.join(paragraphs)

            # Map the content to the matching field based on the heading
            if '教育与工作经历' in heading_text:
                teacher_data["eduWorkHistory"] = section_content
            elif '研究方向' in heading_text:
                teacher_data["researchDirection"] = section_content
            elif '近5年承担的科研项目' in heading_text or '近五年承担的科研项目' in heading_text:
                teacher_data["recentProjects"] = section_content
                # Count project entries numbered like "1.", "2.", ...
                project_count = len([p for p in paragraphs if re.match(r'\d+\.', p.strip())])
                if project_count > 0:
                    teacher_data["projects"] = f"{project_count}项"
                else:
                    teacher_data["projects"] = f"{len(paragraphs)}项"
            elif '代表性学术论文' in heading_text:
                teacher_data["representativePapers"] = section_content
                # Count paper entries that start with "[" (e.g. "[1]")
                paper_count = len([p for p in paragraphs if p.strip().startswith("[")])
                if paper_count > 0:
                    teacher_data["papers"] = f"{paper_count}篇"
                else:
                    teacher_data["papers"] = f"{len(paragraphs)}篇"
            elif '授权国家发明专利' in heading_text or '专利' in heading_text:
                teacher_data["patents"] = section_content

        print(f"Scrape succeeded, extracted teacher data: {teacher_data.get('name', 'unknown')}")
        return teacher_data

    except Exception as e:
        print(f"Scrape error: {str(e)}")
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to scrape the page: {str(e)}"},
        )
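

# --- Minimal usage sketch (an addition, not part of the original module) ---
# scrape_url_improved() only relies on its argument exposing a `.url`
# attribute, so a Pydantic model works as the FastAPI request body. The
# `app`, `ScrapeRequest`, and route names below are hypothetical.
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class ScrapeRequest(BaseModel):
    url: str


@app.post("/scrape")
async def scrape(request: ScrapeRequest):
    # Returns either the extracted teacher_data dict or a JSONResponse error
    return await scrape_url_improved(request)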