CDN Log Analysis
This document provides a CDN log analysis script to help you quickly analyze domain traffic, client access patterns, and geographic traffic distribution.
Feature Overview
The CDN log analysis script supports the following features:
- TOP10 traffic statistics by "Domain + Request Path"
- TOP10 traffic statistics by "Client IP"
- TOP10 traffic statistics by "Province/Region"
- Automatic parsing of .tar.gz format log files
- Generation of readable analysis report files
Usage Steps
1. Prepare Log Files
You can submit a ticket through the Tencent Cloud Ticket System to obtain the CDN logs for your environment.
Organize the CDN log files into the following directory structure:
your-workspace/
├── 20251110/ # Date directory to be analyzed
│ ├── 00/
│ │ └── log.tar.gz
│ ├── 01/
│ └── ...
└── analyze.py # Analysis script
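Before running the analysis, you can optionally sanity-check the layout with a minimal sketch like the following (not part of analyze.py; the date directory name follows the example above):

import os

log_dir = '20251110'  # example date directory to check
for hour in sorted(os.listdir(log_dir)):
    hour_path = os.path.join(log_dir, hour)
    if os.path.isdir(hour_path):
        archives = [f for f in os.listdir(hour_path) if f.endswith('.tar.gz')]
        print(f"{hour}: {len(archives)} .tar.gz archive(s)")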
2. Run Analysis Script
Save the following script as analyze.py:
Example Script
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CDN Log Analysis Script
Statistics for domain+request path, client IP, province/region traffic TOP10
"""
import os
import sys
import tarfile
from collections import defaultdict
import time
# Domestic province mapping table
PROVINCE_MAP = {
    '22': '北京', '86': '内蒙古', '146': '山西', '1208': '甘肃', '1467': '青海',
    '145': '黑龙江', '1464': '辽宁', '120': '江苏', '122': '山东', '1442': '浙江',
    '1135': '湖北', '1069': '河北', '1177': '天津', '119': '宁夏', '152': '陕西',
    '1468': '新疆', '1445': '吉林', '2': '福建', '121': '安徽', '1050': '上海',
    '182': '河南', '1465': '江西', '1466': '湖南', '118': '贵州', '153': '云南',
    '1051': '重庆', '1068': '四川', '1155': '西藏', '4': '广东', '173': '广西',
    '1441': '海南', '0': '其他', '1': '港澳台', '-1': '境外'
}

# Overseas region mapping table
REGION_MAP = {
    # Service regions
    '2000000001': '亚太一区', '2000000002': '亚太二区', '2000000003': '亚太三区',
    '2000000004': '中东', '2000000005': '北美', '2000000006': '欧洲',
    '2000000007': '南美', '2000000008': '非洲',
    # Client regions
    '-20': '亚洲', '-21': '南美洲', '-22': '北美洲', '-23': '欧洲', '-24': '非洲', '-25': '大洋洲',
    # Other regions
    '-15': '亚洲其他', '-14': '南美洲其他', '-13': '北美洲其他',
    '-12': '欧洲其他', '-11': '非洲其他', '-10': '大洋洲其他', '-2': '境外其他',
    # Specific countries/regions
    '35': '尼泊尔', '57': '泰国', '73': '印度', '144': '越南', '192': '法国',
    '207': '英国', '208': '瑞典', '209': '德国', '213': '意大利', '214': '西班牙',
    '386': '阿联酋', '391': '以色列', '397': '乌克兰', '417': '哈萨克斯坦',
    '428': '葡萄牙', '443': '希腊', '471': '沙特阿拉伯', '529': '丹麦', '565': '伊朗',
    '578': '挪威', '669': '美国', '692': '叙利亚', '704': '塞浦路斯', '706': '捷克',
    '707': '瑞士', '708': '伊拉克', '714': '荷兰', '717': '罗马尼亚', '721': '黎巴嫩',
    '725': '匈牙利', '726': '格鲁吉亚', '731': '阿塞拜疆', '734': '奥地利',
    '736': '巴勒斯坦', '737': '土耳其', '759': '立陶宛', '763': '阿曼', '765': '斯洛伐克',
    '766': '塞尔维亚', '770': '芬兰', '773': '比利时', '809': '保加利亚',
    '811': '斯洛文尼亚', '812': '摩尔多瓦', '813': '马其顿', '824': '爱沙尼亚',
    '835': '克罗地亚', '837': '波兰', '852': '拉脱维亚', '857': '约旦',
    '884': '吉尔吉斯斯坦', '896': '爱尔兰', '901': '利比亚', '904': '亚美尼亚',
    '921': '也门', '971': '卢森堡', '1036': '新西兰', '1044': '日本', '1066': '巴基斯坦',
    '1070': '马耳他', '1091': '巴哈马', '1129': '阿根廷', '1134': '孟加拉',
    '1158': '柬埔寨', '1159': '中国澳门', '1176': '新加坡', '1179': '马尔代夫',
    '1180': '阿富汗', '1185': '斐济', '1186': '蒙古', '1195': '印度尼西亚',
    '1200': '中国香港', '1233': '卡塔尔', '1255': '冰岛', '1289': '阿尔巴尼亚',
    '1353': '乌兹别克斯坦', '1407': '圣马力诺', '1416': '科威特', '1417': '黑山',
    '1493': '塔吉克斯坦', '1501': '巴林', '1543': '智利', '1559': '南非', '1567': '埃及',
    '1590': '肯尼亚', '1592': '尼日利亚', '1598': '坦桑尼亚', '1611': '马达加斯加',
    '1613': '安哥拉', '1617': '科特迪瓦', '1620': '苏丹', '1681': '毛里求斯',
    '1693': '摩洛哥', '1695': '阿尔及利亚', '1698': '几内亚', '1730': '塞内加尔',
    '1864': '突尼斯', '1909': '乌拉圭', '1916': '格陵兰', '2026': '中国台湾',
    '2083': '缅甸', '2087': '文莱', '2094': '斯里兰卡', '2150': '巴拿马',
    '2175': '哥伦比亚', '2273': '摩纳哥', '2343': '安道尔', '2421': '土库曼斯坦',
    '2435': '老挝', '2488': '东帝汶', '2490': '汤加', '2588': '菲律宾',
    '2609': '委内瑞拉', '2612': '玻利维亚', '2613': '巴西', '2623': '哥斯达黎加',
    '2626': '墨西哥', '2639': '洪都拉斯', '2645': '萨尔瓦多', '2647': '巴拉圭',
    '2661': '秘鲁', '2728': '尼加拉瓜', '2734': '厄瓜多尔', '2768': '危地马拉',
    '2999': '阿鲁巴', '3058': '埃塞俄比亚', '3144': '波黑', '3216': '多米尼加',
    '3379': '韩国', '3701': '马来西亚', '3839': '加拿大', '4450': '澳大利亚',
    '4460': '中国港澳台'
}

# ISP mapping table
ISP_MAP = {
    '2': '中国电信', '26': '中国联通', '38': '教育网', '43': '长城宽带',
    '1046': '中国移动', '3947': '中国铁通', '0': '其它运营商', '-1': '境外运营商'
}
def get_province_name(code):
    """Get province name by code, checking the province table first, then the region table"""
    code_str = str(code)
    if code_str in PROVINCE_MAP:
        return PROVINCE_MAP[code_str]
    if code_str in REGION_MAP:
        return REGION_MAP[code_str]
    return f"Unknown({code})"

def get_region_name(code):
    """Get region name by code, checking the province table first, then the region table"""
    code_str = str(code)
    if code_str in PROVINCE_MAP:
        return PROVINCE_MAP[code_str]
    if code_str in REGION_MAP:
        return REGION_MAP[code_str]
    return f"Unknown region({code})"
def format_bytes(bytes_size):
    """Format bytes into human-readable units"""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if bytes_size < 1024:
            return f"{bytes_size:.2f}{unit}"
        bytes_size /= 1024
    return f"{bytes_size:.2f}PB"
def parse_cdn_log_line(line):
    """
    Parse a CDN log line (17-field format, 1 extra field compared to the Tencent Cloud standard)
    Field order:
    1. Request time
    2. Client IP
    3. Domain
    4. Request path
    5. Bytes transferred
    6. Province code  ← Extra field
    7. Region code
    8. ISP ID
    9. HTTP status code
    10. Referer
    11. Response time
    12. User-Agent
    13. Range parameter
    14. HTTP Method
    15. Protocol identifier
    16. Cache HIT/MISS
    17. Client port
    """
    parts = line.split()
    if len(parts) < 11:
        return None
    try:
        client_ip = parts[1]        # Client IP
        domain = parts[2]           # Domain
        request_path = parts[3]     # Request path
        bytes_size = int(parts[4])  # Bytes
        province_code = parts[5]    # Province code (extra field)
        region_code = parts[6]      # Region code
        return {
            'client_ip': client_ip,
            'domain': domain,
            'request_path': request_path,
            'bytes_size': bytes_size,
            'province_code': province_code,
            'region_code': region_code
        }
    except (IndexError, ValueError):
        return None
def analyze_tar_gz_file(file_path, domain_path_traffic, client_ip_traffic, province_traffic, region_traffic):
    """Analyze a single tar.gz file"""
    total_bytes = 0
    parsed_lines = 0
    try:
        with tarfile.open(file_path, 'r:gz') as tar:
            for member in tar.getmembers():
                if member.isfile() and member.name.endswith('.log'):
                    log_file = tar.extractfile(member)
                    if log_file:
                        content = log_file.read().decode('utf-8', errors='ignore')
                        lines = content.strip().split('\n')
                        for line in lines:
                            if not line.strip():
                                continue
                            parsed_data = parse_cdn_log_line(line)
                            if parsed_data:
                                bytes_size = parsed_data['bytes_size']
                                # Count domain+request path traffic
                                domain_path = f"{parsed_data['domain']}{parsed_data['request_path']}"
                                domain_path_traffic[domain_path] += bytes_size
                                # Count client IP traffic
                                client_ip_traffic[parsed_data['client_ip']] += bytes_size
                                # Count province code traffic
                                province_traffic[parsed_data['province_code']] += bytes_size
                                # Count region code traffic
                                region_traffic[parsed_data['region_code']] += bytes_size
                                total_bytes += bytes_size
                                parsed_lines += 1
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
    return total_bytes, parsed_lines
def analyze_all_logs(log_dir='20251110'):
    """Analyze all CDN log files"""
    print(f"Analyzing CDN logs under {log_dir}...")
    domain_path_traffic = defaultdict(int)
    client_ip_traffic = defaultdict(int)
    province_traffic = defaultdict(int)
    region_traffic = defaultdict(int)
    total_bytes = 0
    total_parsed = 0
    # Iterate through all hour directories
    for hour_dir in sorted(os.listdir(log_dir)):
        hour_path = os.path.join(log_dir, hour_dir)
        if not os.path.isdir(hour_path):
            continue
        print(f"Processing logs for hour {hour_dir}...")
        for filename in os.listdir(hour_path):
            if filename.endswith('.tar.gz'):
                file_path = os.path.join(hour_path, filename)
                bytes_count, parsed = analyze_tar_gz_file(
                    file_path, domain_path_traffic, client_ip_traffic, province_traffic, region_traffic
                )
                total_bytes += bytes_count
                total_parsed += parsed
    print(f"\nDone: {total_parsed:,} records parsed, total traffic {format_bytes(total_bytes)}")
    return domain_path_traffic, client_ip_traffic, province_traffic, region_traffic, total_bytes
def get_top10(traffic_dict):
    """Get traffic TOP10"""
    sorted_items = sorted(traffic_dict.items(), key=lambda x: x[1], reverse=True)
    return sorted_items[:10]
def write_top10_results(domain_path_traffic, client_ip_traffic, province_traffic, region_traffic, total_bytes, log_date):
    """Write TOP10 results to files"""
    # Create output directory
    output_dir = f"output_{log_date}"
    os.makedirs(output_dir, exist_ok=True)

    def calc_percentage(bytes_size):
        return f"{(bytes_size / total_bytes * 100):.2f}%" if total_bytes > 0 else "0.00%"

    # TOP10 domain+request path
    with open(os.path.join(output_dir, "top10_domain_path.txt"), 'w', encoding='utf-8') as f:
        f.write("Domain+Path\tCDN Traffic\tShare\n")
        for domain_path, bytes_size in get_top10(domain_path_traffic):
            f.write(f"{domain_path}\t{format_bytes(bytes_size)}\t{calc_percentage(bytes_size)}\n")
    # TOP10 client IP
    with open(os.path.join(output_dir, "top10_client_ip.txt"), 'w', encoding='utf-8') as f:
        f.write("Client IP\tCDN Traffic\tShare\n")
        for client_ip, bytes_size in get_top10(client_ip_traffic):
            f.write(f"{client_ip}\t{format_bytes(bytes_size)}\t{calc_percentage(bytes_size)}\n")
    # TOP10 province
    with open(os.path.join(output_dir, "top10_province.txt"), 'w', encoding='utf-8') as f:
        f.write("Province\tCode\tCDN Traffic\tShare\n")
        for province_code, bytes_size in get_top10(province_traffic):
            province_name = get_province_name(province_code)
            f.write(f"{province_name}\t{province_code}\t{format_bytes(bytes_size)}\t{calc_percentage(bytes_size)}\n")
    # TOP10 region
    with open(os.path.join(output_dir, "top10_region.txt"), 'w', encoding='utf-8') as f:
        f.write("Region\tCode\tCDN Traffic\tShare\n")
        for region_code, bytes_size in get_top10(region_traffic):
            region_name = get_region_name(region_code)
            f.write(f"{region_name}\t{region_code}\t{format_bytes(bytes_size)}\t{calc_percentage(bytes_size)}\n")
    print(f"\nResult files written to: {output_dir}/")
    print(f"- {output_dir}/top10_domain_path.txt")
    print(f"- {output_dir}/top10_client_ip.txt")
    print(f"- {output_dir}/top10_province.txt")
    print(f"- {output_dir}/top10_region.txt")
def print_preview(domain_path_traffic, client_ip_traffic, province_traffic, region_traffic, total_bytes):
    """Print TOP10 preview"""
    def calc_percentage(bytes_size):
        return f"{(bytes_size / total_bytes * 100):.2f}%" if total_bytes > 0 else "0.00%"

    print("\n" + "=" * 80)
    print("TOP10 Domain + Request Path:")
    print("=" * 80)
    for i, (domain_path, bytes_size) in enumerate(get_top10(domain_path_traffic), 1):
        display = domain_path[:60] + "..." if len(domain_path) > 60 else domain_path
        print(f"{i:2d}. {display:<63} {format_bytes(bytes_size):>10} ({calc_percentage(bytes_size)})")
    print("\n" + "=" * 60)
    print("TOP10 Client IP:")
    print("=" * 60)
    for i, (client_ip, bytes_size) in enumerate(get_top10(client_ip_traffic), 1):
        print(f"{i:2d}. {client_ip:<15} {format_bytes(bytes_size):>10} ({calc_percentage(bytes_size)})")
    print("\n" + "=" * 60)
    print("TOP10 Provinces:")
    print("=" * 60)
    for i, (province_code, bytes_size) in enumerate(get_top10(province_traffic), 1):
        province_name = get_province_name(province_code)
        print(f"{i:2d}. {province_name:<10} ({province_code:<6}) {format_bytes(bytes_size):>10} ({calc_percentage(bytes_size)})")
    print("\n" + "=" * 60)
    print("TOP10 Regions:")
    print("=" * 60)
    for i, (region_code, bytes_size) in enumerate(get_top10(region_traffic), 1):
        region_name = get_region_name(region_code)
        print(f"{i:2d}. {region_name:<12} ({region_code:<6}) {format_bytes(bytes_size):>10} ({calc_percentage(bytes_size)})")
def select_log_directory():
    """Let the user select the log directory to analyze"""
    # Find candidate log directories in the current directory (8-digit date names)
    log_dirs = []
    for name in sorted(os.listdir('.')):
        if os.path.isdir(name) and name.isdigit() and len(name) == 8:
            log_dirs.append(name)
    if not log_dirs:
        print("Error: no log directory (e.g. 20251110) found in the current directory")
        return None
    if len(log_dirs) == 1:
        print(f"Found log directory: {log_dirs[0]}")
        return log_dirs[0]
    print("Select the log directory to analyze:")
    for i, dir_name in enumerate(log_dirs, 1):
        print(f"  {i}. {dir_name}")
    while True:
        try:
            choice = input(f"Enter a number (1-{len(log_dirs)}): ").strip()
            idx = int(choice) - 1
            if 0 <= idx < len(log_dirs):
                return log_dirs[idx]
            print(f"Please enter a number between 1 and {len(log_dirs)}")
        except ValueError:
            print("Please enter a valid number")
        except KeyboardInterrupt:
            print("\nCancelled")
            return None
def main():
    print("CDN Log Traffic Analysis Tool")
    print("=" * 50)
    # Support a command-line argument to specify the log directory
    if len(sys.argv) > 1:
        log_dir = sys.argv[1]
        if not os.path.isdir(log_dir):
            print(f"Error: directory {log_dir} does not exist")
            return
        print(f"Using specified directory: {log_dir}")
    else:
        log_dir = select_log_directory()
        if not log_dir:
            return
    start_time = time.time()
    domain_path_traffic, client_ip_traffic, province_traffic, region_traffic, total_bytes = analyze_all_logs(log_dir)
    if total_bytes == 0:
        print("Warning: no valid log records were parsed")
        return
    write_top10_results(domain_path_traffic, client_ip_traffic, province_traffic, region_traffic, total_bytes, log_dir)
    print_preview(domain_path_traffic, client_ip_traffic, province_traffic, region_traffic, total_bytes)
    print(f"\nTotal traffic: {format_bytes(total_bytes)}")
    print(f"Analysis finished in {time.time() - start_time:.2f} seconds")

if __name__ == '__main__':
    main()
Execute the script:
python3 analyze.py
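The script also accepts the date directory as a command-line argument, which skips the interactive directory selection (directory name from the example above):
python3 analyze.py 20251110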
3. View Analysis Results
After the script completes, an output_YYYYMMDD directory will be generated in the current directory, containing the following files:
output_20251110/
├── top10_domain_path.txt # Domain+path traffic TOP10
├── top10_client_ip.txt # Client IP traffic TOP10
├── top10_province.txt # Province traffic TOP10
└── top10_region.txt # Region traffic TOP10
4. Result File Examples
top10_domain_path.txt - Domain+request path traffic statistics:
Domain+Path                      CDN Traffic   Share
www.example.com/index.html       123.45MB      35.20%
www.example.com/assets/main.js   89.23MB       25.45%
top10_client_ip.txt - Client IP traffic statistics:
Client IP        CDN Traffic   Share
192.168.1.100    234.56MB      15.30%
10.0.0.50        198.72MB      12.95%
top10_province.txt - Province traffic statistics:
Province   Code   CDN Traffic   Share
广东       4      456.78MB      28.50%
北京       22     389.45MB      24.30%
top10_region.txt - Region traffic statistics:
Region     Code         CDN Traffic   Share
亚太一区   2000000001   567.89MB      32.10%
北美       2000000005   423.56MB      23.95%
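All result files are tab-separated, so they are easy to post-process. Below is a minimal sketch (paths assume the output_20251110 example above) that reloads one report with Python's csv module:

import csv

# Read one of the generated tab-separated reports
with open('output_20251110/top10_domain_path.txt', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='\t')
    header = next(reader)  # column names written by analyze.py
    for rank, row in enumerate(reader, 1):
        domain_path, traffic, share = row
        print(f"{rank:2d}. {domain_path}  {traffic}  ({share})")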