CDN 日志分析

本文档提供 CDN 日志分析脚本，帮助您快速统计域名流量、客户端访问和地域分布等关键数据。

功能概述

CDN 日志分析脚本支持以下功能：

统计「域名+请求路径」流量 TOP10
统计「客户端 IP」流量 TOP10
统计「省份/地区」流量 TOP10
自动解析 .tar.gz 格式的日志文件
生成可读的分析报告文件

使用步骤

1. 准备日志文件

您可以前往腾讯云工单系统提交工单，获取当前环境的 CDN 日志。

将 CDN 日志文件组织为以下目录结构：

your-workspace/
├── 20251110/          # 待分析的日期目录
│   ├── 00/
│   │   └── log.tar.gz
│   ├── 01/
│   └── ...
└── analyze.py         # 分析脚本

2. 运行分析脚本

将以下脚本保存为 analyze.py：

示例脚本

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CDN日志分析脚本
统计域名+请求路径、客户端IP、省份/地区编号的流量TOP10
"""

import heapq
import os
import sys
import tarfile
import time
from collections import defaultdict


# Lookup table: mainland China province code (as a string) -> display name.
# Besides the provinces it also carries the catch-all codes '0' (other),
# '1' (Hong Kong/Macao/Taiwan) and '-1' (overseas).
PROVINCE_MAP = {
    '22': '北京', '86': '内蒙古', '146': '山西', '1208': '甘肃', '1467': '青海',
    '145': '黑龙江', '1464': '辽宁', '120': '江苏', '122': '山东', '1442': '浙江',
    '1135': '湖北', '1069': '河北', '1177': '天津', '119': '宁夏', '152': '陕西',
    '1468': '新疆', '1445': '吉林', '2': '福建', '121': '安徽', '1050': '上海',
    '182': '河南', '1465': '江西', '1466': '湖南', '118': '贵州', '153': '云南',
    '1051': '重庆', '1068': '四川', '1155': '西藏', '4': '广东', '173': '广西',
    '1441': '海南', '0': '其他', '1': '港澳台', '-1': '境外'
}

# Lookup table: overseas region code (as a string) -> display name.
# Consulted by the name-lookup helpers only after PROVINCE_MAP misses.
REGION_MAP = {
    # Service regions
    '2000000001': '亚太一区', '2000000002': '亚太二区', '2000000003': '亚太三区',
    '2000000004': '中东', '2000000005': '北美', '2000000006': '欧洲',
    '2000000007': '南美', '2000000008': '非洲',
    # Client regions (continents)
    '-20': '亚洲', '-21': '南美洲', '-22': '北美洲', '-23': '欧洲', '-24': '非洲', '-25': '大洋洲',
    # Catch-all "other" regions
    '-15': '亚洲其他', '-14': '南美洲其他', '-13': '北美洲其他',
    '-12': '欧洲其他', '-11': '非洲其他', '-10': '大洋洲其他', '-2': '境外其他',
    # Individual countries/regions
    '35': '尼泊尔', '57': '泰国', '73': '印度', '144': '越南', '192': '法国',
    '207': '英国', '208': '瑞典', '209': '德国', '213': '意大利', '214': '西班牙',
    '386': '阿联酋', '391': '以色列', '397': '乌克兰', '417': '哈萨克斯坦',
    '428': '葡萄牙', '443': '希腊', '471': '沙特阿拉伯', '529': '丹麦', '565': '伊朗',
    '578': '挪威', '669': '美国', '692': '叙利亚', '704': '塞浦路斯', '706': '捷克',
    '707': '瑞士', '708': '伊拉克', '714': '荷兰', '717': '罗马尼亚', '721': '黎巴嫩',
    '725': '匈牙利', '726': '格鲁吉亚', '731': '阿塞拜疆', '734': '奥地利',
    '736': '巴勒斯坦', '737': '土耳其', '759': '立陶宛', '763': '阿曼', '765': '斯洛伐克',
    '766': '塞尔维亚', '770': '芬兰', '773': '比利时', '809': '保加利亚',
    '811': '斯洛文尼亚', '812': '摩尔多瓦', '813': '马其顿', '824': '爱沙尼亚',
    '835': '克罗地亚', '837': '波兰', '852': '拉脱维亚', '857': '约旦',
    '884': '吉尔吉斯斯坦', '896': '爱尔兰', '901': '利比亚', '904': '亚美尼亚',
    '921': '也门', '971': '卢森堡', '1036': '新西兰', '1044': '日本', '1066': '巴基斯坦',
    '1070': '马耳他', '1091': '巴哈马', '1129': '阿根廷', '1134': '孟加拉',
    '1158': '柬埔寨', '1159': '中国澳门', '1176': '新加坡', '1179': '马尔代夫',
    '1180': '阿富汗', '1185': '斐济', '1186': '蒙古', '1195': '印度尼西亚',
    '1200': '中国香港', '1233': '卡塔尔', '1255': '冰岛', '1289': '阿尔巴尼亚',
    '1353': '乌兹别克斯坦', '1407': '圣马力诺', '1416': '科威特', '1417': '黑山',
    '1493': '塔吉克斯坦', '1501': '巴林', '1543': '智利', '1559': '南非', '1567': '埃及',
    '1590': '肯尼亚', '1592': '尼日利亚', '1598': '坦桑尼亚', '1611': '马达加斯加',
    '1613': '安哥拉', '1617': '科特迪瓦', '1620': '苏丹', '1681': '毛里求斯',
    '1693': '摩洛哥', '1695': '阿尔及利亚', '1698': '几内亚', '1730': '塞内加尔',
    '1864': '突尼斯', '1909': '乌拉圭', '1916': '格陵兰', '2026': '中国台湾',
    '2083': '缅甸', '2087': '文莱', '2094': '斯里兰卡', '2150': '巴拿马',
    '2175': '哥伦比亚', '2273': '摩纳哥', '2343': '安道尔', '2421': '土库曼斯坦',
    '2435': '老挝', '2488': '东帝汶', '2490': '汤加', '2588': '菲律宾',
    '2609': '委内瑞拉', '2612': '玻利维亚', '2613': '巴西', '2623': '哥斯达黎加',
    '2626': '墨西哥', '2639': '洪都拉斯', '2645': '萨尔瓦多', '2647': '巴拉圭',
    '2661': '秘鲁', '2728': '尼加拉瓜', '2734': '厄瓜多尔', '2768': '危地马拉',
    '2999': '阿鲁巴', '3058': '埃塞俄比亚', '3144': '波黑', '3216': '多米尼加',
    '3379': '韩国', '3701': '马来西亚', '3839': '加拿大', '4450': '澳大利亚',
    '4460': '中国港澳台'
}

# Lookup table: ISP (carrier) code (as a string) -> display name.
# NOTE(review): no function in the visible script reads this table; it looks
# like reference data kept for future per-ISP statistics - confirm before removing.
ISP_MAP = {
    '2': '中国电信', '26': '中国联通', '38': '教育网', '43': '长城宽带',
    '1046': '中国移动', '3947': '中国铁通', '0': '其它运营商', '-1': '境外运营商'
}


def get_province_name(code):
    """Resolve a province code to its display name.

    The code is converted to a string and searched in PROVINCE_MAP first and
    REGION_MAP second. A code found in neither table is reported with the
    placeholder "未知(<code>)".
    """
    key = str(code)
    for table in (PROVINCE_MAP, REGION_MAP):
        if key in table:
            return table[key]
    return f"未知({code})"


def get_region_name(code):
    """Resolve a region code to its display name.

    Lookup order is PROVINCE_MAP, then REGION_MAP (keys are the stringified
    code). When both tables miss, the placeholder "未知地区(<code>)" is
    returned instead.
    """
    key = str(code)
    name = PROVINCE_MAP.get(key)
    if name is None:
        name = REGION_MAP.get(key)
    if name is None:
        return f"未知地区({code})"
    return name


def format_bytes(bytes_size):
    """Render a byte count as a two-decimal string with a binary unit suffix.

    The value is divided by 1024 step by step through B, KB, MB, GB and TB;
    anything still >= 1024 after the TB step is reported in PB.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    idx = 0
    while idx < len(units):
        if bytes_size < 1024:
            return f"{bytes_size:.2f}{units[idx]}"
        bytes_size = bytes_size / 1024
        idx += 1
    # Fell off the end of the unit list: the value is now expressed in PB.
    return f"{bytes_size:.2f}PB"


def parse_cdn_log_line(line):
    """Parse one whitespace-separated CDN log line into a dict.

    Per the script author's notes the expected layout has 17 fields (one
    more than the Tencent Cloud standard format):

        1. request time          10. Referer
        2. client IP             11. response time
        3. domain                12. User-Agent
        4. request path          13. Range parameter
        5. bytes transferred     14. HTTP method
        6. province code (extra) 15. protocol
        7. region code           16. cache HIT/MISS
        8. ISP id                17. client port
        9. HTTP status code

    Only fields 2-7 are extracted. Returns None when the line has fewer
    than 11 tokens or the byte count is not an integer; otherwise a dict
    with keys client_ip, domain, request_path, bytes_size (int),
    province_code and region_code.
    """
    fields = line.split()
    if len(fields) < 11:
        return None

    # The leading token (request time) is not needed for the statistics.
    _, client_ip, domain, request_path, raw_bytes, province_code, region_code = fields[:7]

    try:
        traffic = int(raw_bytes)
    except (IndexError, ValueError):
        return None

    return {
        'client_ip': client_ip,
        'domain': domain,
        'request_path': request_path,
        'bytes_size': traffic,
        'province_code': province_code,
        'region_code': region_code
    }


def analyze_tar_gz_file(file_path, domain_path_traffic, client_ip_traffic, province_traffic, region_traffic):
    """Accumulate traffic statistics from every ``*.log`` member of a .tar.gz.

    Members are streamed line by line instead of being read, decoded and
    split in memory as a whole, so memory use stays flat for large logs and
    each extracted file handle is closed as soon as it has been consumed.

    Args:
        file_path: Path of the gzip-compressed tar archive.
        domain_path_traffic: Mapping "domain + request path" -> bytes; updated in place.
        client_ip_traffic: Mapping client IP -> bytes; updated in place.
        province_traffic: Mapping province code -> bytes; updated in place.
        region_traffic: Mapping region code -> bytes; updated in place.
            All four mappings must support ``+=`` on missing keys
            (e.g. ``defaultdict(int)``).

    Returns:
        A ``(total_bytes, parsed_lines)`` tuple for this archive. If reading
        fails part-way, the error is printed and the counts gathered up to
        that point are returned (they match what was added to the mappings).
    """
    total_bytes = 0
    parsed_lines = 0

    try:
        with tarfile.open(file_path, 'r:gz') as tar:
            # Iterating the TarFile reads headers lazily; getmembers() would
            # scan the whole archive before the first line is processed.
            for member in tar:
                if not (member.isfile() and member.name.endswith('.log')):
                    continue

                log_file = tar.extractfile(member)
                if log_file is None:
                    continue

                with log_file:
                    for raw_line in log_file:
                        # Undecodable bytes are dropped rather than aborting the file.
                        line = raw_line.decode('utf-8', errors='ignore')
                        if not line.strip():
                            continue

                        record = parse_cdn_log_line(line)
                        if not record:
                            continue

                        bytes_size = record['bytes_size']
                        domain_path = f"{record['domain']}{record['request_path']}"

                        domain_path_traffic[domain_path] += bytes_size
                        client_ip_traffic[record['client_ip']] += bytes_size
                        province_traffic[record['province_code']] += bytes_size
                        region_traffic[record['region_code']] += bytes_size

                        total_bytes += bytes_size
                        parsed_lines += 1

    except Exception as e:
        # Deliberate best-effort boundary: one corrupt archive must not stop
        # the remaining archives from being analysed.
        print(f"处理文件 {file_path} 时出错: {e}")

    return total_bytes, parsed_lines


def analyze_all_logs(log_dir='20251110'):
    """Walk ``log_dir/<hour>/*.tar.gz`` and aggregate traffic statistics.

    Every sub-directory of ``log_dir`` (visited in sorted order) is treated
    as an hour bucket; each ``.tar.gz`` inside it is handed to
    ``analyze_tar_gz_file``. Progress and a final summary are printed.

    Returns:
        ``(domain_path_traffic, client_ip_traffic, province_traffic,
        region_traffic, total_bytes)`` where the first four are
        ``defaultdict(int)`` mappings of key -> bytes.
    """
    print(f"开始分析 {log_dir} 目录下的CDN日志...")

    domain_path_traffic, client_ip_traffic, province_traffic, region_traffic = (
        defaultdict(int) for _ in range(4)
    )
    total_bytes = total_parsed = 0

    for hour_dir in sorted(os.listdir(log_dir)):
        hour_path = os.path.join(log_dir, hour_dir)
        if os.path.isdir(hour_path):
            print(f"处理 {hour_dir} 小时的日志...")

            archives = [
                entry for entry in os.listdir(hour_path)
                if entry.endswith('.tar.gz')
            ]
            for archive in archives:
                archive_bytes, archive_records = analyze_tar_gz_file(
                    os.path.join(hour_path, archive),
                    domain_path_traffic,
                    client_ip_traffic,
                    province_traffic,
                    region_traffic,
                )
                total_bytes += archive_bytes
                total_parsed += archive_records

    print(f"\n处理完成: 总记录数 {total_parsed:,}, 总流量 {format_bytes(total_bytes)}")

    return domain_path_traffic, client_ip_traffic, province_traffic, region_traffic, total_bytes


def get_top10(traffic_dict):
    """Return up to ten ``(key, traffic)`` pairs with the highest traffic.

    The result is a list ordered from largest to smallest traffic; keys with
    equal traffic keep their insertion order. ``heapq.nlargest`` is used so
    that a mapping with very many keys (e.g. millions of client IPs) is not
    fully sorted just to keep ten entries.
    """
    return heapq.nlargest(10, traffic_dict.items(), key=lambda item: item[1])


def write_top10_results(domain_path_traffic, client_ip_traffic, province_traffic, region_traffic, total_bytes, log_date):
    """Write the four TOP10 reports as tab-separated text files.

    Files are written to ``output_<date>/`` under the current working
    directory, where ``<date>`` is the final path component of ``log_date``.
    Using only the last component keeps the output location predictable when
    the caller passes a path such as ``/data/20251110`` or ``20251110/``
    instead of a bare directory name.

    Args:
        domain_path_traffic: Mapping "domain + request path" -> bytes.
        client_ip_traffic: Mapping client IP -> bytes.
        province_traffic: Mapping province code -> bytes.
        region_traffic: Mapping region code -> bytes.
        total_bytes: Grand total used as the denominator for percentages.
        log_date: Log directory name or path the statistics came from.
    """
    date_label = os.path.basename(os.path.abspath(str(log_date)))
    output_dir = f"output_{date_label}"
    os.makedirs(output_dir, exist_ok=True)

    def calc_percentage(bytes_size):
        return f"{(bytes_size / total_bytes * 100):.2f}%" if total_bytes > 0 else "0.00%"

    def write_report(filename, header, traffic, name_lookup=None):
        # One report = header line + one row per TOP10 entry. When a lookup
        # is supplied, the row starts with the resolved name followed by the
        # raw code; otherwise the key itself is the first column.
        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
            f.write(header)
            for key, bytes_size in get_top10(traffic):
                label = f"{key}" if name_lookup is None else f"{name_lookup(key)}\t{key}"
                f.write(f"{label}\t{format_bytes(bytes_size)}\t{calc_percentage(bytes_size)}\n")

    write_report("top10_domain_path.txt", "域名+请求路径\tCDN流量\t占比\n", domain_path_traffic)
    write_report("top10_client_ip.txt", "客户端IP\tCDN流量\t占比\n", client_ip_traffic)
    write_report("top10_province.txt", "省份\t编号\tCDN流量\t占比\n", province_traffic, get_province_name)
    write_report("top10_region.txt", "地区\t编号\tCDN流量\t占比\n", region_traffic, get_region_name)

    print(f"\n结果文件已生成到目录: {output_dir}/")
    print(f"- {output_dir}/top10_domain_path.txt")
    print(f"- {output_dir}/top10_client_ip.txt")
    print(f"- {output_dir}/top10_province.txt")
    print(f"- {output_dir}/top10_region.txt")


def print_preview(domain_path_traffic, client_ip_traffic, province_traffic, region_traffic, total_bytes):
    """Print the four TOP10 rankings to stdout as a quick preview.

    Each section shows rank, key (plus resolved name for province/region),
    formatted traffic and its share of ``total_bytes``. Domain+path keys
    longer than 60 characters are cut and suffixed with "...".
    """

    def share(amount):
        if total_bytes > 0:
            return f"{(amount / total_bytes * 100):.2f}%"
        return "0.00%"

    def banner(title, width):
        rule = "=" * width
        print("\n" + rule)
        print(title)
        print(rule)

    banner("TOP10 域名+请求路径:", 80)
    for rank, (target, amount) in enumerate(get_top10(domain_path_traffic), 1):
        shown = target
        if len(target) > 60:
            shown = target[:60] + "..."
        print(f"{rank:2d}. {shown:<63} {format_bytes(amount):>10} ({share(amount)})")

    banner("TOP10 客户端IP:", 60)
    for rank, (ip, amount) in enumerate(get_top10(client_ip_traffic), 1):
        print(f"{rank:2d}. {ip:<15} {format_bytes(amount):>10} ({share(amount)})")

    banner("TOP10 省份:", 60)
    for rank, (code, amount) in enumerate(get_top10(province_traffic), 1):
        label = get_province_name(code)
        print(f"{rank:2d}. {label:<10} ({code:<6}) {format_bytes(amount):>10} ({share(amount)})")

    banner("TOP10 地区:", 60)
    for rank, (code, amount) in enumerate(get_top10(region_traffic), 1):
        label = get_region_name(code)
        print(f"{rank:2d}. {label:<12} ({code:<6}) {format_bytes(amount):>10} ({share(amount)})")


def select_log_directory():
    """Pick the log directory to analyse from the current working directory.

    Candidates are sub-directories whose name is exactly eight digits
    (e.g. ``20251110``), listed in sorted order.

    Returns:
        The directory name, or None when no candidate exists or the user
        cancels the prompt (Ctrl+C, or end-of-input such as a closed or
        piped stdin). With a single candidate it is returned without
        prompting.
    """
    log_dirs = [
        name for name in sorted(os.listdir('.'))
        if os.path.isdir(name) and name.isdigit() and len(name) == 8
    ]

    if not log_dirs:
        print("错误: 当前目录下没有找到日志目录（格式如20251110）")
        return None

    if len(log_dirs) == 1:
        print(f"找到日志目录: {log_dirs[0]}")
        return log_dirs[0]

    print("请选择要分析的日志目录:")
    for i, dir_name in enumerate(log_dirs, 1):
        print(f"  {i}. {dir_name}")

    while True:
        try:
            choice = input(f"请输入序号 (1-{len(log_dirs)}): ").strip()
        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin was closed/exhausted, so re-prompting can never
            # succeed - treat it like an explicit cancel instead of crashing.
            print("\n已取消")
            return None

        try:
            idx = int(choice) - 1
        except ValueError:
            print("请输入有效的数字")
            continue

        if 0 <= idx < len(log_dirs):
            return log_dirs[idx]
        print(f"请输入 1-{len(log_dirs)} 之间的数字")


def main():
    """Command-line entry point: pick a log directory, analyse it, report.

    The first command-line argument, when present, is used as the log
    directory; otherwise the directory is chosen via select_log_directory().
    The run stops early if the directory is missing, no directory is chosen,
    or the aggregated traffic is zero.
    """
    print("CDN日志流量分析工具")
    print("=" * 50)

    cli_args = sys.argv[1:]
    if cli_args:
        log_dir = cli_args[0]
        if not os.path.isdir(log_dir):
            print(f"错误: 目录 {log_dir} 不存在")
            return
        print(f"使用指定目录: {log_dir}")
    else:
        log_dir = select_log_directory()
        if not log_dir:
            return

    start_time = time.time()

    # stats = (domain_path, client_ip, province, region, total_bytes)
    stats = analyze_all_logs(log_dir)
    total_bytes = stats[-1]

    # NOTE(review): zero total traffic is treated as "no valid records".
    if total_bytes == 0:
        print("警告: 没有解析到任何有效的日志记录")
        return

    write_top10_results(*stats, log_dir)
    print_preview(*stats)

    print(f"\n总流量: {format_bytes(total_bytes)}")
    print(f"分析完成，耗时: {time.time() - start_time:.2f} 秒")


# Run the analysis only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

执行脚本（不带参数时会自动查找当前目录下的 8 位数字日期目录；也可以将日志目录作为参数传入，例如 python3 analyze.py 20251110）：

python3 analyze.py

3. 查看分析结果

脚本执行完成后，会在当前目录下生成 output_YYYYMMDD 目录，包含以下文件：

output_20251110/
├── top10_domain_path.txt    # 域名+路径流量 TOP10
├── top10_client_ip.txt      # 客户端 IP 流量 TOP10
├── top10_province.txt       # 省份流量 TOP10
└── top10_region.txt         # 地区流量 TOP10

4. 结果文件示例

top10_domain_path.txt - 域名+请求路径流量统计：

域名+请求路径 CDN流量   占比
www.example.com/index.html  123.45MB    35.20%
www.example.com/assets/main.js  89.23MB 25.45%

top10_client_ip.txt - 客户端 IP 流量统计：

客户端IP   CDN流量   占比
192.168.1.100   234.56MB    15.30%
10.0.0.50   198.72MB    12.95%

top10_province.txt - 省份流量统计：

省份  编号  CDN流量   占比
广东  4   456.78MB    28.50%
北京  22  389.45MB    24.30%

top10_region.txt - 地区流量统计：

地区  编号  CDN流量   占比
亚太一区    2000000001  567.89MB    32.10%
北美  2000000005  423.56MB    23.95%

CDN 日志分析

功能概述

使用步骤

1. 准备日志文件

2. 运行分析脚本

3. 查看分析结果

4. 结果文件示例