dashboard/backend/scrape_url_improved.py
import requests
import socket
import time
import urllib.parse
import random
from bs4 import BeautifulSoup
from fastapi.responses import JSONResponse


async def scrape_url_improved(request):
    """
    Improved version of the URL scraping function; addresses the network
    connectivity problems seen after the server was deployed.
    """
    try:
        print(f"Starting to scrape URL: {request.url}")
        # Request headers that mimic a browser visit
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }
        # DNS resolution check (parse the hostname before the try block so it
        # is still available when building the error message)
        url_parts = urllib.parse.urlparse(request.url)
        hostname = url_parts.hostname
        try:
            print(f"Resolving hostname: {hostname}")
            ip_address = socket.gethostbyname(hostname)
            print(f"Hostname resolved: {hostname} -> {ip_address}")
        except socket.gaierror as dns_error:
            print(f"DNS resolution failed: {dns_error}")
            return JSONResponse(
                status_code=500,
                content={"error": f"DNS resolution failed, cannot reach {hostname}: {str(dns_error)}"},
            )
        # Retry mechanism with generous connect/read timeouts
        max_retries = 3
        timeout_settings = (30, 90)  # (connect timeout, read timeout)
        response = None
        last_error = None
        for attempt in range(max_retries):
            try:
                print(f"Connection attempt {attempt + 1}...")
                start_time = time.time()
                # Fetch the page content; keep the response only after
                # raise_for_status() confirms a successful status code
                resp = requests.get(
                    request.url,
                    headers=headers,
                    timeout=timeout_settings,
                    verify=False,  # temporarily disable SSL verification to sidestep certificate problems
                    allow_redirects=True
                )
                elapsed_time = time.time() - start_time
                print(f"Request succeeded in {elapsed_time:.2f} s")
                resp.raise_for_status()  # raise if the HTTP status indicates failure
                response = resp
                break
            except requests.exceptions.Timeout as timeout_error:
                last_error = timeout_error
                print(f"Attempt {attempt + 1} timed out: {timeout_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2  # back off a little longer each time
                    print(f"Waiting {wait_time} s before retrying...")
                    time.sleep(wait_time)
                continue
            except requests.exceptions.ConnectionError as conn_error:
                last_error = conn_error
                print(f"Attempt {attempt + 1} connection error: {conn_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"Waiting {wait_time} s before retrying...")
                    time.sleep(wait_time)
                continue
            except requests.exceptions.RequestException as req_error:
                last_error = req_error
                print(f"Attempt {attempt + 1} request error: {req_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"Waiting {wait_time} s before retrying...")
                    time.sleep(wait_time)
                continue
        # All retries exhausted
        if response is None:
            error_msg = f"Still unable to connect to {request.url} after {max_retries} retries"
            if last_error:
                error_msg += f", last error: {str(last_error)}"
            print(error_msg)
            return JSONResponse(
                status_code=500,
                content={
                    "error": error_msg,
                    "suggestions": [
                        "Check the server's network connection",
                        "Confirm that the target site is reachable",
                        "Check firewall settings",
                        "Consider configuring a proxy server",
                        "Contact the system administrator to review the network configuration"
                    ]
                },
            )
        # Force UTF-8 so Chinese text decodes correctly
        response.encoding = 'utf-8'
        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        # Base URL for resolving relative paths
        url_parts = urllib.parse.urlparse(request.url)
        base_url = f"{url_parts.scheme}://{url_parts.netloc}"
        # Initialize the result dictionary with defaults
        teacher_data = {
            "id": f"BLG{random.randint(10000, 99999)}",
            "photo": "data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='100' height='120' viewBox='0 0 100 120'%3E%3Crect width='100' height='120' fill='%234986ff' opacity='0.3'/%3E%3Ccircle cx='50' cy='45' r='25' fill='%234986ff' opacity='0.6'/%3E%3Ccircle cx='50' cy='95' r='35' fill='%234986ff' opacity='0.6'/%3E%3C/svg%3E",
            "evaluationData": [
                round(min(100, max(60, 70 + 20 * (0.5 - random.random())))) for _ in range(6)
            ]
        }
        # Basic information from the teacher info table
        info_table = soup.find('div', class_='wz_teacher')
        if info_table:
            table = info_table.find('table')
            if table:
                rows = table.find_all('tr')
                # Name, gender, date of birth
                if len(rows) > 0:
                    cells = rows[0].find_all('td')
                    if len(cells) >= 6:
                        teacher_data["name"] = cells[1].text.strip()
                        teacher_data["gender"] = cells[3].text.strip()
                        teacher_data["birthDate"] = cells[5].text.strip()
                # Title, position, highest degree
                if len(rows) > 1:
                    cells = rows[1].find_all('td')
                    if len(cells) >= 6:
                        teacher_data["title"] = cells[1].text.strip()
                        position = cells[3].text.strip()
                        teacher_data["position"] = position if position else ""
                        teacher_data["education"] = cells[5].text.strip()
                # Academic direction
                if len(rows) > 2:
                    cells = rows[2].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["academicDirection"] = cells[1].text.strip()
                # Talent plan and office location
                if len(rows) > 3:
                    cells = rows[3].find_all('td')
                    if len(cells) >= 6:
                        talent_plan = cells[1].text.strip()
                        teacher_data["talentPlan"] = talent_plan if talent_plan else ""
                        teacher_data["officeLocation"] = cells[5].text.strip()
                # Email and phone number
                if len(rows) > 4:
                    cells = rows[4].find_all('td')
                    if len(cells) >= 6:
                        email = cells[1].text.strip()
                        teacher_data["email"] = email if email else ""
                        phone = cells[5].text.strip()
                        teacher_data["phone"] = phone if phone else ""
                # Mailing address
                if len(rows) > 5:
                    cells = rows[5].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["address"] = cells[1].text.strip()
                # Tutor type
                if len(rows) > 6:
                    cells = rows[6].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["tutorType"] = cells[1].text.strip()
        # Photo
        photo_element = soup.select_one('.teacherInfo .img img')
        if photo_element and photo_element.get('src'):
            img_src = photo_element['src']
            # Resolve relative paths into a full image URL
            if img_src.startswith('../../../'):
                # Derive the base path from the page URL by dropping the file
                # name and the last two directory levels
                url_parts = request.url.split('/')
                if len(url_parts) >= 4:
                    base_path = '/'.join(url_parts[:-3])
                    img_url = f"{base_path}/{img_src[9:]}"  # strip the leading '../../../'
                else:
                    img_url = urllib.parse.urljoin(base_url, img_src)
            else:
                img_url = urllib.parse.urljoin(base_url, img_src)
            # Store the full image URL directly instead of downloading the file
            teacher_data["photo"] = img_url
        # Detailed content sections
        content_divs = soup.select('.con01_t')
        for div in content_divs:
            heading = div.find('h3')
            if not heading:
                continue
            heading_text = heading.text.strip()
            # All non-empty paragraph text in this section
            paragraphs = [p.text.strip() for p in div.find_all('p') if p.text.strip()]
            section_content = '\n'.join(paragraphs)
            # Map the section content to a field based on its heading
            if '教育与工作经历' in heading_text:
                teacher_data["eduWorkHistory"] = section_content
            elif '研究方向' in heading_text:
                teacher_data["researchDirection"] = section_content
            elif '近5年承担的科研项目' in heading_text or '近五年承担的科研项目' in heading_text:
                teacher_data["recentProjects"] = section_content
                # Count project entries, assuming they are numbered "1.", "2.", ... from the top
                project_count = len([p for i, p in enumerate(paragraphs) if p.strip().startswith(f"{i + 1}.")])
                if project_count > 0:
                    teacher_data["projects"] = f"{project_count}"
                else:
                    teacher_data["projects"] = f"{len(paragraphs)}"
            elif '代表性学术论文' in heading_text:
                teacher_data["representativePapers"] = section_content
                # Count papers listed as "[1] ...", "[2] ..."
                paper_count = len([p for p in paragraphs if p.strip().startswith("[")])
                if paper_count > 0:
                    teacher_data["papers"] = f"{paper_count}"
                else:
                    teacher_data["papers"] = f"{len(paragraphs)}"
            elif '授权国家发明专利' in heading_text or '专利' in heading_text:
                teacher_data["patents"] = section_content
        print(f"Scrape succeeded, extracted data for teacher: {teacher_data.get('name', 'unknown')}")
        return teacher_data
    except Exception as e:
        print(f"Scrape error: {str(e)}")
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to scrape the page: {str(e)}"},
        )
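

# --- Usage sketch (not part of the original module) -----------------------
# A minimal, hypothetical example of how scrape_url_improved might be wired
# into a FastAPI app. The ScrapeRequest model, the `app` instance, and the
# /scrape route are assumptions for illustration only: the function above
# only requires that `request` expose a `.url` attribute, so any Pydantic
# model with a `url` field would work.
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class ScrapeRequest(BaseModel):
    url: str


@app.post("/scrape")
async def scrape(request: ScrapeRequest):
    # Returns the extracted teacher_data dict on success, or a JSONResponse
    # with status 500 describing the failure.
    return await scrape_url_improved(request)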