#!/bin/bash
#
# Git large-file scanner.
# Reports oversized files in the working tree, in the index and in history.

set -e

printf '%s\n' "🔍 Git仓库大文件扫描" "====================="

# Refuse to run anywhere that git does not recognise as a repository.
git rev-parse --git-dir > /dev/null 2>&1 || {
  echo "❌ 当前目录不是Git仓库"
  exit 1
}

#######################################
# Format a byte count as a human-readable size.
# Arguments:
#   $1 - size in bytes (non-negative integer; empty or non-numeric
#        input is treated as 0)
# Outputs:
#   Writes "<n>B", "<n.nn>KB", "<n.nn>MB" or "<n.nn>GB" to stdout.
#   The fraction is truncated (not rounded) to two decimals.
# Returns:
#   0 always.
#######################################
format_size() {
  local size=${1:-0}
  [[ "$size" =~ ^[0-9]+$ ]] || size=0
  # Force base 10 so a leading zero is not parsed as octal.
  size=$((10#$size))

  local -r kib=1024 mib=1048576 gib=1073741824
  local unit suffix

  # ">=" so that an exact unit boundary is shown in that unit
  # (1048576 -> 1.00MB rather than 1024.00KB).
  if (( size >= gib )); then
    unit=$gib suffix=GB
  elif (( size >= mib )); then
    unit=$mib suffix=MB
  elif (( size >= kib )); then
    unit=$kib suffix=KB
  else
    printf '%dB\n' "$size"
    return 0
  fi

  # Integer arithmetic only: whole part, then the first two decimals.
  printf '%d.%02d%s\n' \
    "$((size / unit))" "$(((size % unit) * 100 / unit))" "$suffix"
}

echo "📊 分析当前工作目录中的大文件..."
echo ""

# 1. Large files in the working tree (tracked or untracked), biggest first.
echo "🗂️ 当前目录大文件 (>1MB):"
echo "文件大小 文件路径"
echo "-------- --------"

# "{} +" batches many paths into each ls invocation instead of forking one
# ls per file. The awk step keeps the size column, then strips the first
# eight "ls -l" columns one at a time so that whatever remains -- the whole
# path, including any embedded spaces -- is printed intact.
find . -type f -size +1M -not -path "./.git/*" -exec ls -lh {} + |
  awk '{
    size = $5
    for (i = 1; i <= 8; i++) sub(/^[^ ]+ +/, "")
    print size "\t" $0
  }' |
  sort -hr |
  head -20

echo ""

# 2. Large files that are tracked by Git (present in the index and on disk).
echo "📋 Git跟踪的大文件 (>1MB):"
echo "文件大小 文件路径"
echo "-------- --------"

# -z makes git emit raw, NUL-terminated paths. Without it, paths containing
# non-ASCII bytes are C-quoted (core.quotePath) and would fail the -f test,
# and a plain "read" would also eat backslashes and surrounding whitespace.
git ls-files -z | while IFS= read -r -d '' file; do
  [[ -f "$file" ]] || continue
  # BSD/macOS stat syntax first, then GNU; fall back to 0 if both fail.
  size=$(stat -f%z -- "$file" 2>/dev/null \
    || stat -c%s -- "$file" 2>/dev/null \
    || echo 0)
  if (( size > 1048576 )); then # 1MB
    printf "%-10s %s\n" "$(format_size "$size")" "$file"
  fi
done | sort -hr | head -20

echo ""

# 3. Large blobs anywhere in history (this walks every object, so it is slow).
echo "🕰️ 扫描Git历史中的大文件..."
echo "注意:这可能需要一些时间..."
echo ""

echo "📚 Git历史中的大文件 (>1MB):"
echo "文件大小 提交次数 文件路径"
echo "-------- -------- --------"

# List every reachable object with its type, size and path, keep blobs over
# 1MB, and report the 30 largest. The awk step strips the three leading
# columns instead of printing $4 so that paths containing spaces survive.
git rev-list --objects --all |
  git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' |
  awk '$1 == "blob" && ($3 + 0) > 1048576 {
    size = $3
    sub(/^[^ ]+ [^ ]+ [^ ]+ /, "")
    print size "\t" $0
  }' |
  sort -nr |
  head -30 |
  while IFS=$'\t' read -r size path; do
    # Number of commits that touched this path.
    #  * core.quotePath=false: git prints non-ASCII names raw so they can
    #    match $path.
    #  * --literal-pathspecs: glob characters in $path are not expanded.
    #  * grep -x -F: whole-line, fixed-string match ($path is not a regex).
    #  * grep -c already prints "0" (and exits 1) when nothing matches, so
    #    only the exit status needs neutralising -- no extra "echo 0".
    count=$(git --literal-pathspecs -c core.quotePath=false \
      log --all --pretty=format: --name-only -- "$path" 2>/dev/null \
      | grep -cxF -- "$path" || true)
    printf "%-10s %-8s %s\n" "$(format_size "$size")" "$count" "$path"
  done

echo ""

# 4. Overall repository size figures.
echo "📈 仓库体积统计:"
echo "----------------"

# Size of the Git directory. Ask git where it is rather than assuming
# ./.git, which does not exist when the script runs from a subdirectory.
git_dir=$(git rev-parse --git-dir)
git_size=$(du -sh "$git_dir" 2>/dev/null | cut -f1)
echo "Git仓库大小: $git_size"

# Size of the working directory without .git. A pipeline's status is that
# of its last command (cut), so an "||" fallback after it never runs; test
# for empty output instead. Tries GNU du, then BSD du (-I), then plain du.
work_size=$(du -sh --exclude=.git . 2>/dev/null | cut -f1)
[[ -n "$work_size" ]] || work_size=$(du -sh -I .git . 2>/dev/null | cut -f1)
[[ -n "$work_size" ]] || work_size=$(du -sh . 2>/dev/null | cut -f1)
echo "工作目录大小: $work_size"

# Per-type statistics for files over 100KB: type, file count, total bytes.
echo ""
echo "📁 文件类型统计 (>100KB):"
echo "文件类型 数量 总大小"
echo "-------- ---- ------"

# Emit "<bytes>TAB<type>" per file, aggregate count and byte total per type
# in awk, then show the ten most frequent types.
find . -type f -size +100k -not -path "./.git/*" -print0 |
  while IFS= read -r -d '' f; do
    ftype=$(file -b -- "$f" 2>/dev/null) || continue
    fsize=$(stat -f%z -- "$f" 2>/dev/null \
      || stat -c%s -- "$f" 2>/dev/null \
      || echo 0)
    printf '%s\t%s\n' "$fsize" "$ftype"
  done |
  awk -F'\t' '
    { n[$2]++; bytes[$2] += $1 }
    # %.0f rather than %d: some awk builds clamp %d to 32 bits.
    END { for (t in n) printf "%d\t%.0f\t%s\n", n[t], bytes[t], t }
  ' |
  sort -nr |
  head -10 |
  while IFS=$'\t' read -r count total type; do
    printf "%-10s %-6s %s\n" "${type:0:10}" "$count" "$(format_size "$total")"
  done

echo ""

# 5. File types that probably belong in .gitignore.
echo "💡 建议添加到.gitignore的文件类型:"
echo "----------------------------------------"

echo "🔍 发现的可能需要忽略的文件:"

#######################################
# Print up to five regular files below the current directory whose name
# matches any of the given glob patterns. Git's own ./.git directory is
# skipped, matching the other scans in this script.
# Arguments:
#   $1  - category label shown in the heading
#   $2+ - one or more find(1) -name patterns
# Outputs:
#   "📄 <label>:" followed by the matching paths on stdout, or nothing at
#   all when no file matches.
#######################################
report_ignore_candidates() {
  local label=$1
  shift

  # Build "-name p1 -o -name p2 ..." as an array so patterns stay quoted.
  local -a name_tests=()
  local pattern
  for pattern in "$@"; do
    if (( ${#name_tests[@]} > 0 )); then
      name_tests+=(-o)
    fi
    name_tests+=(-name "$pattern")
  done

  local matches
  matches=$(find . -type f -not -path "./.git/*" \
    \( "${name_tests[@]}" \) | head -5)
  if [[ -n "$matches" ]]; then
    echo "📄 ${label}:"
    echo "$matches"
  fi
}

# Build artefacts
report_ignore_candidates "编译文件" "*.pyc" "*.pyo" "*.class" "*.o" "*.so" "*.dll"
# Log files
report_ignore_candidates "日志文件" "*.log" "*.log.*"
# Temporary files
report_ignore_candidates "临时文件" "*.tmp" "*.temp" "*~" ".DS_Store"
# Database files
report_ignore_candidates "数据库文件" "*.db" "*.sqlite" "*.sqlite3"
# Archives
report_ignore_candidates "压缩文件" "*.zip" "*.tar.gz" "*.rar" "*.7z"

# Closing advice and copy-paste cleanup commands. The here-doc delimiter is
# quoted, so the text below is printed literally with no expansion.
cat <<'EOF'

🎯 清理建议:
------------
1. 将大的编译文件、日志文件添加到.gitignore
2. 使用 'git rm --cached <file>' 移除已跟踪的大文件
3. 考虑使用Git LFS管理大的二进制文件
4. 定期清理临时文件和构建产物

🔧 快速清理命令:
----------------
# 移除编译文件跟踪
find . -name '*.pyc' -exec git rm --cached {} \; 2>/dev/null

# 移除日志文件跟踪
find . -name '*.log' -exec git rm --cached {} \; 2>/dev/null

# 移除数据库文件跟踪
find . -name '*.db' -exec git rm --cached {} \; 2>/dev/null

✅ 扫描完成!
EOF