dashboard/backend/scrape_url_improved.py

import requests
import socket
import time
import urllib.parse
import random
from bs4 import BeautifulSoup
from fastapi.responses import JSONResponse

async def scrape_url_improved(request):
    """
    改进版本的URL抓取函数，解决服务器部署后的网络连接问题
    """
    try:
        print(f"开始抓取URL: {request.url}")
        
        # 设置请求头，模拟浏览器访问
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }
        
        # 添加DNS解析测试
        try:
            url_parts = urllib.parse.urlparse(request.url)
            hostname = url_parts.hostname
            print(f"正在解析域名: {hostname}")
            ip_address = socket.gethostbyname(hostname)
            print(f"域名解析成功: {hostname} -> {ip_address}")
        except socket.gaierror as dns_error:
            print(f"DNS解析失败: {dns_error}")
            return JSONResponse(
                status_code=500,
                content={"error": f"DNS解析失败，无法访问 {hostname}: {str(dns_error)}"},
            )
        
        # 使用重试机制，增加连接和读取超时时间
        max_retries = 3
        timeout_settings = (30, 90)  # (连接超时, 读取超时)
        
        response = None
        last_error = None
        
        for attempt in range(max_retries):
            try:
                print(f"第 {attempt + 1} 次尝试连接...")
                start_time = time.time()
                
                # 发送HTTP请求获取页面内容，增加超时时间和重试
                response = requests.get(
                    request.url, 
                    headers=headers, 
                    timeout=timeout_settings,
                    verify=False,  # 临时禁用SSL验证，避免证书问题
                    allow_redirects=True
                )
                
                elapsed_time = time.time() - start_time
                print(f"请求成功，耗时: {elapsed_time:.2f}秒")
                response.raise_for_status()  # 如果请求失败，抛出异常
                break
                
            except requests.exceptions.Timeout as timeout_error:
                last_error = timeout_error
                print(f"第 {attempt + 1} 次尝试超时: {timeout_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2  # 递增等待时间
                    print(f"等待 {wait_time} 秒后重试...")
                    time.sleep(wait_time)
                continue
                
            except requests.exceptions.ConnectionError as conn_error:
                last_error = conn_error
                print(f"第 {attempt + 1} 次尝试连接错误: {conn_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"等待 {wait_time} 秒后重试...")
                    time.sleep(wait_time)
                continue
                
            except requests.exceptions.RequestException as req_error:
                last_error = req_error
                print(f"第 {attempt + 1} 次尝试请求错误: {req_error}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"等待 {wait_time} 秒后重试...")
                    time.sleep(wait_time)
                continue
        
        # 如果所有重试都失败了
        if response is None:
            error_msg = f"经过 {max_retries} 次重试后仍然无法连接到 {request.url}"
            if last_error:
                error_msg += f"，最后错误: {str(last_error)}"
            print(error_msg)
            return JSONResponse(
                status_code=500,
                content={
                    "error": error_msg,
                    "suggestions": [
                        "检查服务器网络连接",
                        "确认目标网站是否可访问", 
                        "检查防火墙设置",
                        "考虑配置代理服务器",
                        "联系系统管理员检查网络配置"
                    ]
                },
            )
        
        # 设置编码以正确处理中文字符
        response.encoding = 'utf-8'
        
        # 解析HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # 获取基础URL用于解析相对路径
        url_parts = urllib.parse.urlparse(request.url)
        base_url = f"{url_parts.scheme}://{url_parts.netloc}"
        
        # 初始化数据字典
        teacher_data = {
            "id": f"BLG{random.randint(10000, 99999)}",
            "photo": f"data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='100' height='120' viewBox='0 0 100 120'%3E%3Crect width='100' height='120' fill='%234986ff' opacity='0.3'/%3E%3Ccircle cx='50' cy='45' r='25' fill='%234986ff' opacity='0.6'/%3E%3Ccircle cx='50' cy='95' r='35' fill='%234986ff' opacity='0.6'/%3E%3C/svg%3E",
            "evaluationData": [
                round(min(100, max(60, 70 + 20 * (0.5 - random.random())))) for _ in range(6)
            ]
        }
        
        # 从教师信息表提取基本信息
        info_table = soup.find('div', class_='wz_teacher')
        if info_table:
            table = info_table.find('table')
            if table:
                rows = table.find_all('tr')
                
                # 提取姓名、性别、出生年月
                if len(rows) > 0:
                    cells = rows[0].find_all('td')
                    if len(cells) >= 6:
                        teacher_data["name"] = cells[1].text.strip()
                        teacher_data["gender"] = cells[3].text.strip()
                        teacher_data["birthDate"] = cells[5].text.strip()
                
                # 提取职称、职务、最高学历
                if len(rows) > 1:
                    cells = rows[1].find_all('td')
                    if len(cells) >= 6:
                        teacher_data["title"] = cells[1].text.strip()
                        position = cells[3].text.strip()
                        teacher_data["position"] = position if position else ""
                        teacher_data["education"] = cells[5].text.strip()
                
                # 提取学科方向
                if len(rows) > 2:
                    cells = rows[2].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["academicDirection"] = cells[1].text.strip()
                
                # 提取人才计划和办公地点
                if len(rows) > 3:
                    cells = rows[3].find_all('td')
                    if len(cells) >= 6:
                        talent_plan = cells[1].text.strip()
                        teacher_data["talentPlan"] = talent_plan if talent_plan else ""
                        teacher_data["officeLocation"] = cells[5].text.strip()
                
                # 提取电子邮件和联系方式
                if len(rows) > 4:
                    cells = rows[4].find_all('td')
                    if len(cells) >= 6:
                        email = cells[1].text.strip()
                        teacher_data["email"] = email if email else ""
                        phone = cells[5].text.strip()
                        teacher_data["phone"] = phone if phone else ""
                
                # 提取通讯地址
                if len(rows) > 5:
                    cells = rows[5].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["address"] = cells[1].text.strip()
                
                # 提取导师类型
                if len(rows) > 6:
                    cells = rows[6].find_all('td')
                    if len(cells) >= 2:
                        teacher_data["tutorType"] = cells[1].text.strip()
        
        # 提取照片
        photo_element = soup.select_one('.teacherInfo .img img')
        if photo_element and photo_element.get('src'):
            img_src = photo_element['src']
            
            # 处理相对路径，构建完整的图片URL
            if img_src.startswith('../../../'):
                # 从URL获取基础路径（移除文件名和最后两级目录）
                url_parts = request.url.split('/')
                if len(url_parts) >= 4:
                    base_path = '/'.join(url_parts[:-3])
                    img_url = f"{base_path}/{img_src[9:]}"  # 移除 '../../../'
                else:
                    img_url = urllib.parse.urljoin(base_url, img_src)
            else:
                img_url = urllib.parse.urljoin(base_url, img_src)
            
            # 直接保存完整的图片URL，不下载到本地
            teacher_data["photo"] = img_url
        
        # 提取详细信息部分
        content_divs = soup.select('.con01_t')
        for div in content_divs:
            heading = div.find('h3')
            if not heading:
                continue
                
            heading_text = heading.text.strip()
            
            # 获取该部分的所有段落文本
            paragraphs = [p.text.strip() for p in div.find_all('p') if p.text.strip()]
            section_content = '\n'.join(paragraphs)
            
            # 根据标题将内容映射到相应字段
            if '教育与工作经历' in heading_text:
                teacher_data["eduWorkHistory"] = section_content
            elif '研究方向' in heading_text:
                teacher_data["researchDirection"] = section_content
            elif '近5年承担的科研项目' in heading_text or '近五年承担的科研项目' in heading_text:
                teacher_data["recentProjects"] = section_content
                # 计算项目数量
                project_count = len([p for p in paragraphs if p.strip().startswith(str(len(paragraphs) - paragraphs.index(p))+".")])
                if project_count > 0:
                    teacher_data["projects"] = f"{project_count}项"
                else:
                    teacher_data["projects"] = f"{len(paragraphs)}项"
            elif '代表性学术论文' in heading_text:
                teacher_data["representativePapers"] = section_content
                # 计算论文数量
                paper_count = len([p for p in paragraphs if p.strip().startswith("[")])
                if paper_count > 0:
                    teacher_data["papers"] = f"{paper_count}篇"
                else:
                    teacher_data["papers"] = f"{len(paragraphs)}篇"
            elif '授权国家发明专利' in heading_text or '专利' in heading_text:
                teacher_data["patents"] = section_content
        
        print(f"抓取成功，提取到教师数据: {teacher_data.get('name', '未知')}")
        return teacher_data
        
    except Exception as e:
        print(f"抓取错误: {str(e)}")
        return JSONResponse(
            status_code=500,
            content={"error": f"抓取网页失败: {str(e)}"},
        )