✅ Menghapus baris duplikat
✅ Menghapus baris yang terlalu panjang, dengan panjang maksimum yang bisa kamu atur (default: 500 karakter)
✅ Menangani karakter non-UTF-8 dengan aman
✅ Baris “rusak” yang memuat http:// atau https:// bukan di awal baris juga dibuang (diasumsikan hasil gabungan tanpa newline).
✅ Hemat memori (streaming)
✅ Opsi untuk backup file asli sebelum diganti (tulis ke berkas .tmp → os.replace()), hemat RAM (streaming), dan mendukung karakter “aneh”.
📄 Script lengkap
#!/usr/bin/env python3
"""
dedupe.py – Hapus baris duplikat, baris terlalu panjang, dan baris rusak
yang memuat http/https di posisi salah, dengan cara aman & hemat memori.
"""
from __future__ import annotations
import argparse, os, re, shutil, tempfile, time, locale
from pathlib import Path
# ────────────────── Util ──────────────────
def now() -> str:
    """Current wall-clock time as an ``HH:MM:SS`` string (log prefix)."""
    return time.strftime('%H:%M:%S')


# Minimal URL detector: matches "http://" or "https://", case-insensitively.
URL_RE = re.compile(r'https?://', re.IGNORECASE)
# ────────────────── Core ──────────────────
def process(
    path: Path, *,
    enc: str,
    keep_backup: bool,
    max_len: int,
) -> tuple[int, int, int]:
    """Clean *path* in one streaming pass and atomically replace it.

    Lines are dropped, in order of precedence:
      1. lines longer than *max_len* characters (newline excluded),
      2. "broken" lines where ``http://`` / ``https://`` appears anywhere
         but at the (whitespace-stripped) start — assumed to be lines
         glued together without a newline,
      3. exact duplicates (the first occurrence is kept).

    Args:
        path: text file to clean in place.
        enc: encoding used for both reading and writing.
        keep_backup: if True, copy the original to ``<name>.bak`` first.
        max_len: maximum allowed line length.

    Returns:
        (dup_count, long_count, broken_url_count)

    NOTE: the ``seen`` set holds every surviving unique line, so memory
    grows with the number of unique lines — streaming, but not O(1).
    """
    # Compiled locally so the function is self-contained; built once per call.
    url_re = re.compile(r'https?://', re.I)
    seen: set[str] = set()
    dup_count = long_count = broken_url_count = 0
    tmp = tempfile.NamedTemporaryFile(
        mode='w',
        encoding=enc,
        errors='surrogateescape',  # round-trips undecodable bytes unchanged
        newline='',                # preserve original line endings verbatim
        delete=False,
        dir=path.parent,           # same filesystem so os.replace() is atomic
    )
    try:
        with tmp, path.open('r', encoding=enc, errors='surrogateescape',
                            newline='') as src:
            for line in src:
                # 1) Drop over-length lines (length measured without EOL).
                if len(line.rstrip('\n\r')) > max_len:
                    long_count += 1
                    continue
                # 2) Drop broken lines: URL present but not at line start.
                m = url_re.search(line)
                if m and not line.lstrip().startswith(m.group(0)):
                    broken_url_count += 1
                    continue
                # 3) Drop exact duplicates.
                if line in seen:
                    dup_count += 1
                    continue
                seen.add(line)
                tmp.write(line)
    except BaseException:
        # Fix: don't leak the half-written temp file on any failure
        # (the original left it behind because delete=False).
        os.unlink(tmp.name)
        raise
    if keep_backup:
        shutil.copy2(path, path.with_suffix(path.suffix + '.bak'))
    # Fix: NamedTemporaryFile is created with mode 0600; without this the
    # replaced file would lose the original's permissions.
    shutil.copymode(path, tmp.name)
    os.replace(tmp.name, path)
    return dup_count, long_count, broken_url_count
# ────────────────── CLI ──────────────────
def main() -> None:
    """CLI entry point: parse arguments, run process(), print a summary."""
    p = argparse.ArgumentParser(
        description=('Safely remove duplicate lines, over-length lines, and '
                     'lines where http/https appears in the middle.')
    )
    p.add_argument('file', type=Path, help='text file to clean')
    p.add_argument('-e', '--encoding', default='utf-8',
                   help='file encoding (default: utf-8)')
    p.add_argument('-b', '--backup', action='store_true',
                   help='save original as .bak before replacing')
    p.add_argument('-m', '--max-length', type=int, default=500,
                   help='maximum allowed line length (default: 500)')
    args = p.parse_args()
    if not args.file.is_file():
        p.error(f'File not found: {args.file}')
    try:
        dup, long_lines, bad = process(
            args.file,
            enc=args.encoding,
            keep_backup=args.backup,
            max_len=args.max_length,
        )
    except UnicodeDecodeError:
        # NOTE(review): process() reads with errors='surrogateescape', which
        # maps undecodable bytes to surrogates instead of raising, so this
        # fallback should essentially never trigger — kept as a safety net.
        sys_enc = locale.getpreferredencoding(False)
        if sys_enc.lower() == args.encoding.lower():
            # Fix: retrying with the same encoding would fail identically.
            raise
        print(f'[{now()}] Encoding "{args.encoding}" gagal. '
              f'Mencoba fallback "{sys_enc}".')
        dup, long_lines, bad = process(
            args.file,
            enc=sys_enc,
            keep_backup=args.backup,
            max_len=args.max_length,
        )
    print(f'[{now()}] {dup} duplikat, {long_lines} baris panjang, '
          f'{bad} baris rusak dibuang dari {args.file.name}.')


if __name__ == '__main__':
    main()
# Paling sederhana
python dedupe.py data.txt
# Atur panjang maksimum 300 char & buat backup
python dedupe.py data.txt -m 300 -b
# File Latin-1
python dedupe.py data.txt -e latin-1