Below is a "bulk" variant (`dedupe_dir.py`) that cleans *all* `.txt` files in a folder at once (optionally recursive). It inherits every feature of the previous script:

- ✅ duplicate-line removal
- ✅ removal of over-length lines (`--max-length`, default 500)
- ✅ removal of broken lines (a URL in the middle of the line)
- ✅ safe writes via a `*.tmp` file ➔ `os.replace()` (see the standalone sketch after this list)
- ✅ `--backup` and `--encoding` support
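Before the full listing, here is the safe-write idiom the script relies on, shown in isolation. This is a minimal sketch; the helper name `atomic_rewrite` is illustrative and not part of the script itself:

```python
import os
import tempfile
from pathlib import Path

def atomic_rewrite(path: Path, new_text: str) -> None:
    """Illustrative helper: overwrite `path` without ever exposing a partial file."""
    # Write to a temp file in the SAME directory, so os.replace() stays on
    # one filesystem and the rename is atomic (POSIX, and Windows same-volume).
    with tempfile.NamedTemporaryFile('w', dir=path.parent,
                                     delete=False, encoding='utf-8') as tmp:
        tmp.write(new_text)
    os.replace(tmp.name, path)  # readers see either the old file or the new one
```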
```python
#!/usr/bin/env python3
"""
dedupe_dir.py – Clean every *.txt file in a directory.

Features:
• Remove duplicate lines, over-length lines, and broken lines (http/https mid-line).
• Safe writes (.tmp file, then os.replace()).
• Streams line by line; only each file's unique lines are held in memory.
• Optional .bak backup.
• Recursive (-r), or only the top level of the directory.
"""
from __future__ import annotations
import argparse, os, re, shutil, tempfile, time, locale, sys
from pathlib import Path
URL_RE = re.compile(r'https?://', re.I)
# ───────────────────────────── Util
def now() -> str:
    return time.strftime('%H:%M:%S')


def clean_file(
    path: Path,
    *,
    enc: str,
    keep_backup: bool,
    max_len: int,
) -> tuple[int, int, int]:
    """Clean one .txt file in place; return (dup, long, bad) removal counts."""
    seen: set[str] = set()
    dup = long = bad = 0
    tmp = tempfile.NamedTemporaryFile(
        mode='w',
        encoding=enc,
        newline='',
        delete=False,
        dir=path.parent,  # same directory keeps os.replace() atomic
    )
    try:
        # Read strictly so a wrong encoding raises UnicodeDecodeError,
        # letting main() retry with the system encoding.
        with tmp, path.open('r', encoding=enc, newline='') as src:
            for line in src:
                # Over-length line (length measured without the newline).
                if len(line.rstrip('\n\r')) > max_len:
                    long += 1
                    continue
                # Broken line: a URL that does not start the line.
                m = URL_RE.search(line)
                if m and not line.lstrip().startswith(m.group(0)):
                    bad += 1
                    continue
                # Exact duplicate of an earlier line in this file.
                if line in seen:
                    dup += 1
                    continue
                seen.add(line)
                tmp.write(line)
    except BaseException:
        os.unlink(tmp.name)  # don't leave a half-written .tmp behind
        raise
    if keep_backup:
        shutil.copy2(path, path.with_suffix(path.suffix + '.bak'))
    os.replace(tmp.name, path)
    return dup, long, bad
# ───────────────────────────── CLI
def main() -> None:
    p = argparse.ArgumentParser(
        description='Bulk-remove duplicate / over-length / broken-URL lines '
                    'in every *.txt file of a directory.'
    )
    p.add_argument('directory', type=Path, help='folder containing the *.txt files')
    p.add_argument('-r', '--recursive', action='store_true', help='process sub-folders too')
    p.add_argument('-e', '--encoding', default='utf-8', help='file encoding (default: utf-8)')
    p.add_argument('-b', '--backup', action='store_true', help='create a .bak before overwriting each file')
    p.add_argument('-m', '--max-length', type=int, default=500, help='maximum line length (default: 500)')
    args = p.parse_args()

    root: Path = args.directory
    if not root.is_dir():
        p.error(f'Not a directory: {root}')

    # Collect the *.txt files (recursively with -r).
    files = list(root.rglob('*.txt') if args.recursive else root.glob('*.txt'))
    if not files:
        print(f'[{now()}] No .txt files found in {root}')
        sys.exit(0)

    total_dup = total_long = total_bad = 0
    for f in files:
        try:
            dup, long, bad = clean_file(
                f, enc=args.encoding, keep_backup=args.backup, max_len=args.max_length
            )
        except UnicodeDecodeError:
            # Retry once with the system's preferred encoding.
            sys_enc = locale.getpreferredencoding(False)
            print(f'[{now()}] {f.name}: encoding "{args.encoding}" failed. '
                  f'Falling back to "{sys_enc}".')
            dup, long, bad = clean_file(
                f, enc=sys_enc, keep_backup=args.backup, max_len=args.max_length
            )
        print(f'[{now()}] {f.name}: -{dup} dup -{long} long -{bad} bad')
        total_dup += dup
        total_long += long
        total_bad += bad

    print(f'\n[{now()}] Done → {len(files)} file(s) processed')
    print(f' • {total_dup} duplicate lines removed')
    print(f' • {total_long} over-length lines removed')
    print(f' • {total_bad} broken lines removed')


if __name__ == '__main__':
    main()
```
📄 Usage

```bash
# Clean every *.txt in the ./log folder
python dedupe_dir.py ./log

# Include sub-folders + back up each file
python dedupe_dir.py ./log -r -b

# Maximum line length 300 characters & latin-1 encoding
python dedupe_dir.py ./log -m 300 -e latin-1
```
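To sanity-check the cleaning rules end to end, `clean_file` can also be called directly from Python. A small smoke test, assuming `dedupe_dir.py` is importable from the working directory (`sample.txt` is just an example file created on the spot):

```python
from pathlib import Path
from dedupe_dir import clean_file

sample = Path('sample.txt')
sample.write_text(
    'hello\n'
    'hello\n'                               # duplicate   -> removed
    'see https://example.com mid-line\n'    # mid-line URL -> removed
    + 'x' * 600 + '\n',                     # over 500 chars -> removed
    encoding='utf-8',
)
dup, long_, bad = clean_file(sample, enc='utf-8', keep_backup=False, max_len=500)
print(dup, long_, bad)                           # expected: 1 1 1
print(repr(sample.read_text(encoding='utf-8')))  # 'hello\n' is all that remains
```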
Notes
- Files that are already clean are still overwritten (writing always goes through a .tmp file); their removal counts will simply read 0.
- Performance holds up for thousands of large files: each file is streamed one at a time, and memory use is bounded by the unique lines of the file currently being processed, since the `seen` set is reset per file. A lower-memory dedup variant is sketched below.
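If that per-file `seen` set ever becomes a concern (files with many long, unique lines), a common trick is to store fixed-size hashes instead of the lines themselves. A sketch of that variant, not part of `dedupe_dir.py` itself:

```python
import hashlib

def is_duplicate(line: str, seen: set[bytes]) -> bool:
    """Hash-based duplicate check: each entry costs a fixed 16 bytes."""
    # A 16-byte BLAKE2 digest makes accidental collisions astronomically
    # unlikely while keeping memory use independent of line length.
    digest = hashlib.blake2b(line.encode('utf-8'), digest_size=16).digest()
    if digest in seen:
        return True
    seen.add(digest)
    return False
```

Inside `clean_file`, the `if line in seen:` / `seen.add(line)` pair would become a single `if is_duplicate(line, seen):` check, with `seen` typed as `set[bytes]`.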