import random
import re
import socket
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup
from fastapi.responses import JSONResponse


async def scrape_url_improved(request):
    """
    Improved URL scraper that addresses the network connectivity problems
    seen after server deployment (DNS failures, slow links, flaky hosts).
    """
    try:
        print(f"Starting to scrape URL: {request.url}")

        # Browser-like request headers to avoid naive bot blocking.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }

        # DNS resolution check before issuing the HTTP request.
        try:
            url_parts = urllib.parse.urlparse(request.url)
            hostname = url_parts.hostname
            print(f"Resolving hostname: {hostname}")
            ip_address = socket.gethostbyname(hostname)
            print(f"DNS resolution succeeded: {hostname} -> {ip_address}")
        except socket.gaierror as dns_error:
            print(f"DNS resolution failed: {dns_error}")
            return JSONResponse(
                status_code=500,
                content={"error": f"DNS resolution failed, cannot reach {hostname}: {str(dns_error)}"},
            )

        # Retry loop with generous connect/read timeouts.
        max_retries = 3
        timeout_settings = (30, 90)  # (connect timeout, read timeout)
        response = None
        last_error = None

        for attempt in range(max_retries):
            try:
                print(f"Connection attempt {attempt + 1}...")
                start_time = time.time()

                # Note: requests is synchronous and blocks the event loop
                # inside this async function; for production consider
                # httpx.AsyncClient or running it in an executor.
                response = requests.get(
                    request.url,
                    headers=headers,
                    timeout=timeout_settings,
                    verify=False,  # temporarily disable SSL verification to sidestep certificate issues
                    allow_redirects=True
                )

                elapsed_time = time.time() - start_time
                print(f"Request succeeded in {elapsed_time:.2f}s")

                response.raise_for_status()  # raise on HTTP error status
                break
            except requests.exceptions.Timeout as timeout_error:
                last_error = timeout_error
                print(f"Attempt {attempt + 1} timed out: {timeout_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2  # linear backoff
                    print(f"Waiting {wait_time}s before retrying...")
                    time.sleep(wait_time)
                continue
            except requests.exceptions.ConnectionError as conn_error:
                last_error = conn_error
                print(f"Attempt {attempt + 1} connection error: {conn_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"Waiting {wait_time}s before retrying...")
                    time.sleep(wait_time)
                continue
            except requests.exceptions.RequestException as req_error:
                last_error = req_error
                # Reset response so an HTTP error status (raised by
                # raise_for_status) is not mistaken for a successful fetch below.
                response = None
                print(f"Attempt {attempt + 1} request error: {req_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"Waiting {wait_time}s before retrying...")
                    time.sleep(wait_time)
                continue

        # All retries exhausted.
        if response is None:
            error_msg = f"Still unable to connect to {request.url} after {max_retries} retries"
            if last_error:
                error_msg += f", last error: {str(last_error)}"
            print(error_msg)
            return JSONResponse(
                status_code=500,
                content={
                    "error": error_msg,
                    "suggestions": [
                        "Check the server's network connection",
                        "Confirm the target site is reachable",
                        "Check firewall settings",
                        "Consider configuring a proxy server",
                        "Contact the system administrator to review the network configuration"
                    ]
                },
            )

        # Force UTF-8 so Chinese characters decode correctly.
        response.encoding = 'utf-8'

        # Parse the HTML.
        soup = BeautifulSoup(response.text, 'html.parser')

        # Base URL for resolving relative paths.
        url_parts = urllib.parse.urlparse(request.url)
        base_url = f"{url_parts.scheme}://{url_parts.netloc}"

        # Initialize the result dict with a placeholder photo and random scores.
        teacher_data = {
            "id": f"BLG{random.randint(10000, 99999)}",
            "photo": "data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='100' height='120' viewBox='0 0 100 120'%3E%3Crect width='100' height='120' fill='%234986ff' opacity='0.3'/%3E%3Ccircle cx='50' cy='45' r='25' fill='%234986ff' opacity='0.6'/%3E%3Ccircle cx='50' cy='95' r='35' fill='%234986ff' opacity='0.6'/%3E%3C/svg%3E",
            "evaluationData": [
                round(min(100, max(60, 70 + 20 * (0.5 - random.random()))))
                for _ in range(6)
            ]
        }

        # Extract basic info from the teacher info table.
        info_table = soup.find('div', class_='wz_teacher')
        if info_table:
            table = info_table.find('table')
            if table:
                rows = table.find_all('tr')

                # Row 0: name, gender, date of birth.
                if len(rows) > 0:
                    cells = rows[0].find_all('td')
                    if len(cells) >= 6:
                        teacher_data["name"] = cells[1].text.strip()
                        teacher_data["gender"] = cells[3].text.strip()
                        teacher_data["birthDate"] = cells[5].text.strip()

                # Row 1: title, position, highest degree.
                if len(rows) > 1:
                    cells = rows[1].find_all('td')
                    if len(cells) >= 6:
                        teacher_data["title"] = cells[1].text.strip()
                        position = cells[3].text.strip()
                        teacher_data["position"] = position if position else ""
                        teacher_data["education"] = cells[5].text.strip()

                # Row 2: academic direction.
                if len(rows) > 2:
                    cells = rows[2].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["academicDirection"] = cells[1].text.strip()

                # Row 3: talent program and office location.
                if len(rows) > 3:
                    cells = rows[3].find_all('td')
                    if len(cells) >= 6:
                        talent_plan = cells[1].text.strip()
                        teacher_data["talentPlan"] = talent_plan if talent_plan else ""
                        teacher_data["officeLocation"] = cells[5].text.strip()

                # Row 4: email and phone.
                if len(rows) > 4:
                    cells = rows[4].find_all('td')
                    if len(cells) >= 6:
                        email = cells[1].text.strip()
                        teacher_data["email"] = email if email else ""
                        phone = cells[5].text.strip()
                        teacher_data["phone"] = phone if phone else ""

                # Row 5: mailing address.
                if len(rows) > 5:
                    cells = rows[5].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["address"] = cells[1].text.strip()

                # Row 6: tutor type.
                if len(rows) > 6:
                    cells = rows[6].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["tutorType"] = cells[1].text.strip()

        # Extract the photo.
        photo_element = soup.select_one('.teacherInfo .img img')
        if photo_element and photo_element.get('src'):
            img_src = photo_element['src']
            # urljoin resolves relative paths (including '../' sequences)
            # against the page URL, so no manual path surgery is needed.
            img_url = urllib.parse.urljoin(request.url, img_src)
            # Store the full image URL directly instead of downloading it.
            teacher_data["photo"] = img_url

        # Extract the detailed sections.
        content_divs = soup.select('.con01_t')
        for div in content_divs:
            heading = div.find('h3')
            if not heading:
                continue
            heading_text = heading.text.strip()

            # Collect all non-empty paragraph texts in this section.
            paragraphs = [p.text.strip() for p in div.find_all('p') if p.text.strip()]
            section_content = '\n'.join(paragraphs)

            # Map the section to a field by its (Chinese) heading; the literals
            # below must stay in Chinese because they match the scraped page.
            if '教育与工作经历' in heading_text:  # education and work history
                teacher_data["eduWorkHistory"] = section_content
            elif '研究方向' in heading_text:  # research direction
                teacher_data["researchDirection"] = section_content
            elif '近5年承担的科研项目' in heading_text or '近五年承担的科研项目' in heading_text:  # projects in the last five years
                teacher_data["recentProjects"] = section_content
                # Count numbered project entries such as "1." / "2.";
                # fall back to the paragraph count if none match.
                project_count = len([p for p in paragraphs if re.match(r'\d+\.', p)])
                if project_count > 0:
                    teacher_data["projects"] = f"{project_count}项"  # "项" = items
                else:
                    teacher_data["projects"] = f"{len(paragraphs)}项"
            elif '代表性学术论文' in heading_text:  # representative papers
                teacher_data["representativePapers"] = section_content
                # Count citation-style entries such as "[1]".
                paper_count = len([p for p in paragraphs if p.startswith("[")])
                if paper_count > 0:
                    teacher_data["papers"] = f"{paper_count}篇"  # "篇" = papers
                else:
                    teacher_data["papers"] = f"{len(paragraphs)}篇"
            elif '授权国家发明专利' in heading_text or '专利' in heading_text:  # patents
                teacher_data["patents"] = section_content

        print(f"Scrape succeeded, extracted teacher data: {teacher_data.get('name', 'unknown')}")
        return teacher_data

    except Exception as e:
        print(f"Scrape error: {str(e)}")
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to scrape the page: {str(e)}"},
        )
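
# --- Usage sketch -----------------------------------------------------------
# A minimal illustration (not part of the original module) of how
# scrape_url_improved could be mounted as a FastAPI endpoint. The function
# only reads `request.url`, so a small Pydantic model is enough; the
# `ScrapeRequest` model, the `/scrape` route, and the `app` object below are
# assumptions for this example, not names taken from the original code.

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class ScrapeRequest(BaseModel):
    url: str  # hypothetical request body: {"url": "https://..."}


@app.post("/scrape")
async def scrape(request: ScrapeRequest):
    # Returns either the teacher_data dict (serialized to JSON by FastAPI)
    # or the JSONResponse error built inside scrape_url_improved.
    return await scrape_url_improved(request)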