dashboard/backend/scrape_url_improved.py

import requests
import socket
import time
import urllib.parse
import random
import re
from bs4 import BeautifulSoup
from fastapi.responses import JSONResponse

async def scrape_url_improved(request):
    """
    Improved version of the URL scraping function; addresses the network
    connectivity problems seen after deploying to the server.
    """
    try:
        print(f"开始抓取URL: {request.url}")

        # Request headers that mimic a normal browser visit
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }
        # Resolve the hostname first so DNS failures are reported clearly
        try:
            url_parts = urllib.parse.urlparse(request.url)
            hostname = url_parts.hostname
            print(f"正在解析域名: {hostname}")
            ip_address = socket.gethostbyname(hostname)
            print(f"域名解析成功: {hostname} -> {ip_address}")
        except socket.gaierror as dns_error:
            print(f"DNS解析失败: {dns_error}")
            return JSONResponse(
                status_code=500,
                content={"error": f"DNS解析失败,无法访问 {hostname}: {str(dns_error)}"},
            )
        # Retry mechanism with generous connection and read timeouts
        max_retries = 3
        timeout_settings = (30, 90)  # (connect timeout, read timeout) in seconds
        response = None
        last_error = None

        for attempt in range(max_retries):
            try:
                print(f"第 {attempt + 1} 次尝试连接...")
                start_time = time.time()

                # Send the HTTP request for the page content, with extended timeouts
                response = requests.get(
                    request.url,
                    headers=headers,
                    timeout=timeout_settings,
                    verify=False,  # temporarily disable SSL verification to avoid certificate issues
                    allow_redirects=True
                )

                elapsed_time = time.time() - start_time
                print(f"请求成功,耗时: {elapsed_time:.2f}")

                response.raise_for_status()  # raise on HTTP error status codes
                break
            except requests.exceptions.Timeout as timeout_error:
                last_error = timeout_error
                print(f"第 {attempt + 1} 次尝试超时: {timeout_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2  # back off a little longer after each attempt
                    print(f"等待 {wait_time} 秒后重试...")
                    time.sleep(wait_time)
                continue

            except requests.exceptions.ConnectionError as conn_error:
                last_error = conn_error
                print(f"第 {attempt + 1} 次尝试连接错误: {conn_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"等待 {wait_time} 秒后重试...")
                    time.sleep(wait_time)
                continue

            except requests.exceptions.RequestException as req_error:
                last_error = req_error
                response = None  # discard the failed response so the check below reports the error
                print(f"第 {attempt + 1} 次尝试请求错误: {req_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"等待 {wait_time} 秒后重试...")
                    time.sleep(wait_time)
                continue
        # All retries exhausted
        if response is None:
            error_msg = f"经过 {max_retries} 次重试后仍然无法连接到 {request.url}"
            if last_error:
                error_msg += f",最后错误: {str(last_error)}"
            print(error_msg)
            return JSONResponse(
                status_code=500,
                content={
                    "error": error_msg,
                    "suggestions": [
                        "检查服务器网络连接",
                        "确认目标网站是否可访问",
                        "检查防火墙设置",
                        "考虑配置代理服务器",
                        "联系系统管理员检查网络配置"
                    ]
                },
            )
        # Force UTF-8 so Chinese characters decode correctly
        response.encoding = 'utf-8'

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Base URL for resolving relative paths
        url_parts = urllib.parse.urlparse(request.url)
        base_url = f"{url_parts.scheme}://{url_parts.netloc}"

        # Initialise the teacher data dictionary
        teacher_data = {
            "id": f"BLG{random.randint(10000, 99999)}",
            "photo": "data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='100' height='120' viewBox='0 0 100 120'%3E%3Crect width='100' height='120' fill='%234986ff' opacity='0.3'/%3E%3Ccircle cx='50' cy='45' r='25' fill='%234986ff' opacity='0.6'/%3E%3Ccircle cx='50' cy='95' r='35' fill='%234986ff' opacity='0.6'/%3E%3C/svg%3E",
            "evaluationData": [
                round(min(100, max(60, 70 + 20 * (0.5 - random.random())))) for _ in range(6)
            ]
        }
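        # Note: "photo" starts as an inline SVG placeholder avatar and is replaced below
        # if a real photo is found on the page; "evaluationData" holds six random
        # placeholder scores in roughly the 60-80 range.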
        # Extract basic information from the teacher info table
        info_table = soup.find('div', class_='wz_teacher')
        if info_table:
            table = info_table.find('table')
            if table:
                rows = table.find_all('tr')

                # Row 0: name, gender, date of birth
                if len(rows) > 0:
                    cells = rows[0].find_all('td')
                    if len(cells) >= 6:
                        teacher_data["name"] = cells[1].text.strip()
                        teacher_data["gender"] = cells[3].text.strip()
                        teacher_data["birthDate"] = cells[5].text.strip()

                # Row 1: title, position, highest degree
                if len(rows) > 1:
                    cells = rows[1].find_all('td')
                    if len(cells) >= 6:
                        teacher_data["title"] = cells[1].text.strip()
                        position = cells[3].text.strip()
                        teacher_data["position"] = position if position else ""
                        teacher_data["education"] = cells[5].text.strip()

                # Row 2: academic direction
                if len(rows) > 2:
                    cells = rows[2].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["academicDirection"] = cells[1].text.strip()

                # Row 3: talent programme and office location
                if len(rows) > 3:
                    cells = rows[3].find_all('td')
                    if len(cells) >= 6:
                        talent_plan = cells[1].text.strip()
                        teacher_data["talentPlan"] = talent_plan if talent_plan else ""
                        teacher_data["officeLocation"] = cells[5].text.strip()

                # Row 4: email and phone number
                if len(rows) > 4:
                    cells = rows[4].find_all('td')
                    if len(cells) >= 6:
                        email = cells[1].text.strip()
                        teacher_data["email"] = email if email else ""
                        phone = cells[5].text.strip()
                        teacher_data["phone"] = phone if phone else ""

                # Row 5: mailing address
                if len(rows) > 5:
                    cells = rows[5].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["address"] = cells[1].text.strip()

                # Row 6: supervisor type
                if len(rows) > 6:
                    cells = rows[6].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["tutorType"] = cells[1].text.strip()
        # Extract the photo
        photo_element = soup.select_one('.teacherInfo .img img')
        if photo_element and photo_element.get('src'):
            img_src = photo_element['src']
            # Resolve relative paths into a full image URL
            if img_src.startswith('../../../'):
                # Derive the base path from the page URL by dropping the file name
                # and the last two directory levels
                url_parts = request.url.split('/')
                if len(url_parts) >= 4:
                    base_path = '/'.join(url_parts[:-3])
                    img_url = f"{base_path}/{img_src[9:]}"  # strip the leading '../../../'
                else:
                    img_url = urllib.parse.urljoin(base_url, img_src)
            else:
                img_url = urllib.parse.urljoin(base_url, img_src)
            # Store the full image URL directly instead of downloading the file
            teacher_data["photo"] = img_url
        # Extract the detailed profile sections
        content_divs = soup.select('.con01_t')
        for div in content_divs:
            heading = div.find('h3')
            if not heading:
                continue
            heading_text = heading.text.strip()

            # Collect every non-empty paragraph in this section
            paragraphs = [p.text.strip() for p in div.find_all('p') if p.text.strip()]
            section_content = '\n'.join(paragraphs)

            # Map the section content to the matching field based on its heading
            if '教育与工作经历' in heading_text:
                teacher_data["eduWorkHistory"] = section_content
            elif '研究方向' in heading_text:
                teacher_data["researchDirection"] = section_content
            elif '近5年承担的科研项目' in heading_text or '近五年承担的科研项目' in heading_text:
                teacher_data["recentProjects"] = section_content
                # Count numbered project entries ("1.", "2."); fall back to the paragraph count
                project_count = len([p for p in paragraphs if re.match(r'\d+\s*[.、]', p)])
                if project_count > 0:
                    teacher_data["projects"] = f"{project_count}"
                else:
                    teacher_data["projects"] = f"{len(paragraphs)}"
            elif '代表性学术论文' in heading_text:
                teacher_data["representativePapers"] = section_content
                # Count paper entries that start with a bracketed index such as "[1]"
                paper_count = len([p for p in paragraphs if p.startswith("[")])
                if paper_count > 0:
                    teacher_data["papers"] = f"{paper_count}"
                else:
                    teacher_data["papers"] = f"{len(paragraphs)}"
            elif '授权国家发明专利' in heading_text or '专利' in heading_text:
                teacher_data["patents"] = section_content
print(f"抓取成功,提取到教师数据: {teacher_data.get('name', '未知')}")
return teacher_data
except Exception as e:
print(f"抓取错误: {str(e)}")
return JSONResponse(
status_code=500,
content={"error": f"抓取网页失败: {str(e)}"},
)
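
# Usage sketch (not part of the handler above): `scrape_url_improved` only assumes a
# request object with a `.url` attribute, so it can be wired into a FastAPI app roughly
# as follows. `ScrapeRequest` and the "/scrape" route are hypothetical names.
#
#     from fastapi import FastAPI
#     from pydantic import BaseModel
#
#     class ScrapeRequest(BaseModel):
#         url: str
#
#     app = FastAPI()
#
#     @app.post("/scrape")
#     async def scrape(request: ScrapeRequest):
#         return await scrape_url_improved(request)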