Below is a "bulk" variant (`dedupe_dir.py`) that cleans *all* `.txt` files in a folder at once (optionally recursive). It inherits every feature of the previous script:

- ✅ duplicate-line removal
- ✅ removal of over-length lines (`--max-length`, default 500)
- ✅ removal of broken lines (a URL in the middle of the line)
- ✅ safe writes via a `*.tmp` file ➔ `os.replace()` (see the standalone sketch after this list)
- ✅ `--backup` and `--encoding` support
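Before the full listing, here is the safe-write idiom the script relies on, shown in isolation. This is a minimal sketch; the helper name `atomic_rewrite` is illustrative and not part of the script itself:

```python
import os
import tempfile
from pathlib import Path

def atomic_rewrite(path: Path, new_text: str) -> None:
    """Illustrative helper: overwrite `path` without ever exposing a partial file."""
    # Write to a temp file in the SAME directory, so os.replace() stays on
    # one filesystem and the rename is atomic (POSIX, and Windows same-volume).
    with tempfile.NamedTemporaryFile('w', dir=path.parent,
                                     delete=False, encoding='utf-8') as tmp:
        tmp.write(new_text)
    os.replace(tmp.name, path)  # readers see either the old file or the new one
```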
```python
#!/usr/bin/env python3
"""
dedupe_dir.py – Clean every *.txt file in a directory.

Features:
• Remove duplicate lines, over-length lines, and broken lines (http/https mid-line).
• Safe writes (.tmp file, then os.replace()).
• Streams line by line; only each file's unique lines are held in memory.
• Optional .bak backup.
• Recursive (-r), or only the top level of the directory.
"""
from __future__ import annotations
import argparse, os, re, shutil, tempfile, time, locale, sys
from pathlib import Path
URL_RE = re.compile(r'https?://', re.I)
# ───────────────────────────── Util
def now() -> str:
    return time.strftime('%H:%M:%S')


def clean_file(
    path: Path,
    *,
    enc: str,
    keep_backup: bool,
    max_len: int,
) -> tuple[int, int, int]:
    """Clean one .txt file in place; return (dup, long, bad) removal counts."""
    seen: set[str] = set()
    dup = long = bad = 0
    tmp = tempfile.NamedTemporaryFile(
        mode='w',
        encoding=enc,
        newline='',
        delete=False,
        dir=path.parent,  # same directory keeps os.replace() atomic
    )
    try:
        # Read strictly so a wrong encoding raises UnicodeDecodeError,
        # letting main() retry with the system encoding.
        with tmp, path.open('r', encoding=enc, newline='') as src:
            for line in src:
                # Over-length line (length measured without the newline).
                if len(line.rstrip('\n\r')) > max_len:
                    long += 1
                    continue
                # Broken line: a URL that does not start the line.
                m = URL_RE.search(line)
                if m and not line.lstrip().startswith(m.group(0)):
                    bad += 1
                    continue
                # Exact duplicate of an earlier line in this file.
                if line in seen:
                    dup += 1
                    continue
                seen.add(line)
                tmp.write(line)
    except BaseException:
        os.unlink(tmp.name)  # don't leave a half-written .tmp behind
        raise
    if keep_backup:
        shutil.copy2(path, path.with_suffix(path.suffix + '.bak'))
    os.replace(tmp.name, path)
    return dup, long, bad
# ───────────────────────────── CLI
def main() -> None:
    p = argparse.ArgumentParser(
        description='Bulk-remove duplicate / over-length / broken-URL lines '
                    'in every *.txt file of a directory.'
    )
    p.add_argument('directory', type=Path, help='folder containing the *.txt files')
    p.add_argument('-r', '--recursive', action='store_true', help='process sub-folders too')
    p.add_argument('-e', '--encoding', default='utf-8', help='file encoding (default: utf-8)')
    p.add_argument('-b', '--backup', action='store_true', help='create a .bak before overwriting each file')
    p.add_argument('-m', '--max-length', type=int, default=500, help='maximum line length (default: 500)')
    args = p.parse_args()

    root: Path = args.directory
    if not root.is_dir():
        p.error(f'Not a directory: {root}')

    # Collect the *.txt files (recursively with -r).
    files = list(root.rglob('*.txt') if args.recursive else root.glob('*.txt'))
    if not files:
        print(f'[{now()}] No .txt files found in {root}')
        sys.exit(0)

    total_dup = total_long = total_bad = 0
    for f in files:
        try:
            dup, long, bad = clean_file(
                f, enc=args.encoding, keep_backup=args.backup, max_len=args.max_length
            )
        except UnicodeDecodeError:
            # Retry once with the system's preferred encoding.
            sys_enc = locale.getpreferredencoding(False)
            print(f'[{now()}] {f.name}: encoding "{args.encoding}" failed. '
                  f'Falling back to "{sys_enc}".')
            dup, long, bad = clean_file(
                f, enc=sys_enc, keep_backup=args.backup, max_len=args.max_length
            )
        print(f'[{now()}] {f.name}: -{dup} dup -{long} long -{bad} bad')
        total_dup += dup
        total_long += long
        total_bad += bad

    print(f'\n[{now()}] Done → {len(files)} file(s) processed')
    print(f' • {total_dup} duplicate lines removed')
    print(f' • {total_long} over-length lines removed')
    print(f' • {total_bad} broken lines removed')


if __name__ == '__main__':
    main()
```
📄 Usage

```bash
# Clean every *.txt in the ./log folder
python dedupe_dir.py ./log

# Include sub-folders + back up each file
python dedupe_dir.py ./log -r -b

# Maximum line length 300 characters & latin-1 encoding
python dedupe_dir.py ./log -m 300 -e latin-1
```
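To sanity-check the cleaning rules end to end, `clean_file` can also be called directly from Python. A small smoke test, assuming `dedupe_dir.py` is importable from the working directory (`sample.txt` is just an example file created on the spot):

```python
from pathlib import Path
from dedupe_dir import clean_file

sample = Path('sample.txt')
sample.write_text(
    'hello\n'
    'hello\n'                               # duplicate   -> removed
    'see https://example.com mid-line\n'    # mid-line URL -> removed
    + 'x' * 600 + '\n',                     # over 500 chars -> removed
    encoding='utf-8',
)
dup, long_, bad = clean_file(sample, enc='utf-8', keep_backup=False, max_len=500)
print(dup, long_, bad)                           # expected: 1 1 1
print(repr(sample.read_text(encoding='utf-8')))  # 'hello\n' is all that remains
```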
Notes
- Files that are already clean are still overwritten (writing always goes through a .tmp file); their removal counts will simply read 0.
- Performance holds up for thousands of large files: each file is streamed one at a time, and memory use is bounded by the unique lines of the file currently being processed, since the `seen` set is reset per file. A lower-memory dedup variant is sketched below.
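If that per-file `seen` set ever becomes a concern (files with many long, unique lines), a common trick is to store fixed-size hashes instead of the lines themselves. A sketch of that variant, not part of `dedupe_dir.py` itself:

```python
import hashlib

def is_duplicate(line: str, seen: set[bytes]) -> bool:
    """Hash-based duplicate check: each entry costs a fixed 16 bytes."""
    # A 16-byte BLAKE2 digest makes accidental collisions astronomically
    # unlikely while keeping memory use independent of line length.
    digest = hashlib.blake2b(line.encode('utf-8'), digest_size=16).digest()
    if digest in seen:
        return True
    seen.add(digest)
    return False
```

Inside `clean_file`, the `if line in seen:` / `seen.add(line)` pair would become a single `if is_duplicate(line, seen):` check, with `seen` typed as `set[bytes]`.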