Python utility to clean noise from flat text files

Posted on

Problem

I have created simple code in which it takes flat file (txt) and eliminate the noise and error and only return clean rows, i.e. return flat file with the data I am interested in. it works fine but since I am a beginner I am sure my code is not efficient. I would appreciate if you can guide me into making it more professional and efficient – please be as brutal as you want, it will all add to my learning and development.

import pandas as pd
import numpy as np


def openFile(path, type):
    file = open(path,type)
    return file

def getFileDelim(f_path, f_del):
    file = openFile(f_path, 'r')
    lines = file.readlines()
    cnt_list = np.array([0])
    cnt_list = np.delete(cnt_list,0)

    for i in range(100):  
        cnt_list = np.append(cnt_list,lines[i].count(f_del))
    return np.bincount(cnt_list).argmax()

def getHeaders(f_path):
    file_o = openFile(f_path.replace('.txt', '_ok.txt'), 'r')
    file_o_ho = openFile(f_path.replace('.txt', '_ok_header_ok.txt'), 'w')
    file_o_hn = openFile(f_path.replace('.txt', '_ok_header_noise.txt'), 'w')

    lines = file_o.readlines()
    
    cnt = 0
    for line in lines:
        if cnt == 0:
           header = line
           file_o_ho.write(line)
        else:
            if line != header:
                file_o_ho.write(line)
            else:
                file_o_hn.write(line)
        cnt += 1





def fileProcessing(f_path, f_del, del_cnt):
    file = openFile(f_path, 'r')
    file_o = openFile(f_path.replace('.txt', '_ok.txt'), 'w')
    file_n = openFile(f_path.replace('.txt', '_noise.txt'), 'w')
    file_e = openFile(f_path.replace('.txt', '_error.txt'), 'w')
    
    lines = file.readlines()

    for line in lines:    
        if line.count("|") == del_cnt:
            file_o.write(line)
        elif line.count("|") > 0:
            file_n.write(line)
        elif line.count("|") == 0:
            file_e.write(line)

    header_process = input('Do you want to analyse the header record of this file (y/n)?:')
    if header_process.lower() == 'y' or header_process.lower() == 'yes':
        getHeaders(f_path)     

def generateReport(f_path, f_del, del_count):
    output = "File Processing report:n"
    output += "............................................................................................n"    
    output += "File Path: {}n".format(f_path)
    output += "Delimeter: {}n".format(f_del)
    output += "Delimeter Count: {}n".format(del_count)

    output += "n"


    file = openFile(f_path, 'r')
    file_o = openFile(f_path.replace('.txt', '_ok.txt'), 'r')
    file_n = openFile(f_path.replace('.txt', '_noise.txt'), 'r')
    file_e = openFile(f_path.replace('.txt', '_error.txt'), 'r')

    f_cnt = len(file.readlines())
    f_o_cnt = len(file_o.readlines())
    f_e_cnt = len(file_n.readlines())
    f_n_cnt = len(file_e.readlines())
    output += "Original File Line Count: {}n".format(f_cnt)
    output += "Number of ok lines: {}n".format(f_o_cnt)
    output += "Number of noise lines: {}n".format(f_n_cnt)
    output += "Number of error lines: {}n".format(f_e_cnt)
    output += "Total number of lines assessed: {}n".format(f_o_cnt + f_n_cnt + f_e_cnt)

    file_r = openFile(f_path.replace('.txt', '_report.txt'), 'w') 
    file_r.write(output)   

    return output

def main (f_path,f_del,del_cnt = None):
    if del_cnt == None:
        del_cnt = getFileDelim(f_path, f_del)
    fileProcessing(f_path, f_del, del_cnt)
    print(generateReport(f_path, f_del, del_cnt))

f_path = input("Please provide file path:").replace('"','')
f_del = input("Provide the delimeter:")


main(f_path,f_del)

The raw file i will be processing is like below picture
enter image description here

15.04.2021                                                                                                     Lieferungen mit Auftragsdaten                                                                                                        1
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Lieferungen mit Auftragsdaten
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|Lieferung |   Pos|Angel.am  |VStl|LFArt|Werk|Warenempf.|IstWA Dat |Material     |Materialnummer                                                          |Liefermenge|ME |MS|Auftragsart|Aufragsdatum|Auftragsmenge|ME |VB|Werk|Lieferung |Verkaufsb.|
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|9888477351|000010|24.01.2020|LC01|ZLSO |0156|V00000DEB7|24.01.2020|01.93105-9780|RO  M3313-C-E235+N-M3574-B21 20X2 HL  6000                              |        18 |M  |35|ZSO1       |20.01.2020  |          18 |M  |01|0156|9888477351|214898993 |
|9888401282|000010|11.01.2020|LC03|ZLSO |0156|V00000CH01|13.01.2020|04.33335-9950|BODENBELAG  PVC-MAN323-M-2000-F5251A-GAYA MOSAIC NT4482 BABEL RO24M     |        24 |M  |35|ZSO1       |20.12.2019  |          24 |M  |01|0156|9888401282|214805942 |
|9888437256|000070|17.01.2020|LC01|ZLTA |0156|V00000DE33|20.01.2020|04.38235-9315|SCHWEISSCHNUR  PVC FRAN RO100M DUNKELGRAU                               |        40 |M  |35|ZTA1       |16.01.2020  |          40 |M  |02|0156|9888437256|214888602 |
|9888363103|000010|06.01.2020|LC01|ZLSO |0156|V00000DE78|06.01.2020|04.38235-9315|SCHWEISSCHNUR  PVC FRAN RO100M DUNKELGRAU                               |        20 |M  |35|ZSO1       |06.01.2020  |          20 |M  |01|0156|9888363103|214834613 |
|9888411482|000010|14.01.2020|LC03|ZLSO |0156|V008005037|14.01.2020|33.02640-0034|HALTER                                                                  |         1 |ST |35|ZSO1       |18.12.2019  |           1 |ST |01|0156|9888411482|214795849 |
|9888470166|000010|23.01.2020|LC01|ZLTA |0156|V00000NO00|23.01.2020|33.25140-0010|HALTER  RE                                                              |         1 |ST |35|ZTA1       |25.11.2019  |           1 |ST |02|0156|9888470166|214696718 |

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
15.04.2021                                                                                                     Lieferungen mit Auftragsdaten                                                                                                        2
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|Lieferung |   Pos|Angel.am  |VStl|LFArt|Werk|Warenempf.|IstWA Dat |Material     |Materialnummer                                                          |Liefermenge|ME |MS|Auftragsart|Aufragsdatum|Auftragsmenge|ME |VB|Werk|Lieferung |Verkaufsb.|
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|9888502930|000010|29.01.2020|LC01|ZLSO |0156|V000050401|29.01.2020|81.25456-6921|ZSB KABELSTRANG  FHS ZUSATZHEIZUNG D5W OHNE TRS                         |         1 |ST |35|ZSO1       |16.12.2019  |           1 |ST |01|0156|9888502930|214785306 |
|9888503710|000010|29.01.2020|LC01|ZLTA |0156|V00000AT29|29.01.2020|81.25456-6982|ZSB KABELSTRANG  RAHMEN LINKS BEI EBS-5                                 |         1 |ST |35|ZTA1       |11.12.2019  |           1 |ST |02|0156|9888503710|214764484 |
|9888453594|000010|21.01.2020|LC03|ZLSO |0156|V00000GB08|21.01.2020|81.25459-5207|ZSB KABELSTRANG  SCHEINWERFER LI RANGIERLEUCHTE                         |         1 |ST |35|ZSO1       |12.12.2019  |           1 |ST |01|0156|9888453594|214774049 |
|9888477288|000010|24.01.2020|LC03|ZLSO |0156|V00000GB17|24.01.2020|81.25459-7711|ZSB KABELSTRANG  FHS WINTERDIENSTBELEUCHTUNG                            |         1 |ST |35|ZSO1       |05.12.2019  |           1 |ST |01|0156|9888477288|214741051 |
|9888485462|000010|25.01.2020|LC03|ZLTA |0156|V00000GB17|27.01.2020|81.25459-7711|ZSB KABELSTRANG  FHS WINTERDIENSTBELEUCHTUNG                            |         1 |ST |35|ZTA1       |20.12.2019  |           1 |ST |02|0156|9888485462|214805832 |

Solution

Soooo. First, this has nothing to do with Pandas (you included the library but then didn’t use it); and your use of Numpy is dubious. I’m going to suggest that you skip Numpy altogether.

A brief laundry list of things that should change:

  • Use lower_snake_case for function and variable names
  • Consider adding PEP484 type hints
  • Even if you did need numpy, which you don’t, it doesn’t make sense to populate and then delete an array – just call np.empty((0,))
  • for i in range(100) will crash for any file less than that many lines
  • You don’t actually need to successive-append to an entire numpy array – if you did need to successive-append, Python lists are often faster for that; but you don’t need either. You can just apply a running max.
  • .replace('.txt' has a very funny bug. Since you assume that all files end in .txt, any file that is passed in with a different extension will be trampled due to a skipped replacement and subsequent overwrite. Use pathlib and stems instead.
  • Generally having to check for an index on the inside of a loop, as in getHeaders, is a code smell; and that’s true here. Easier to represent the file as an iterator and do a single next() to get the first line.
  • You’ve asked for a separator and then mostly ignored it and assumed a pipe instead.
  • An easier way to tee output to stdout and a file is to use real logging.
  • Use is None instead of == None
  • Use a main guard

The following is one way of accomplishing the above, and enforcing that the file only needs to be opened once, only needs to be traversed once, and does not need to be kept in memory all at the same time. If the file is small enough (say 1GB or less), it would probably be faster to use a different algorithm that loads all lines into memory, which this does not do. Anyway:

Suggested

from functools import partial
from itertools import islice, tee
from logging import getLogger, StreamHandler, FileHandler, INFO
from pathlib import Path
from typing import Iterable, TextIO, Callable


def setup_logger():
    logger = getLogger('clean')
    logger.addHandler(StreamHandler())
    logger.addHandler(FileHandler(filename='report.txt'))
    logger.setLevel(INFO)
    return logger


logger = setup_logger()


def get_delim_counts(file: TextIO, delim: str, n_lines: int = 100) -> Iterable[int]:
    for line in islice(file, n_lines):
        yield line.count(delim)
    file.seek(0)


def match_output(
    orig_path: Path,
    lines: Iterable[str],
    stem: str,
    delim: str,
    delim_pred: Callable[[int], bool],
) -> Iterable:
    n = 0
    path = orig_path.with_stem(f'{orig_path.stem}_{stem}')

    with path.open('w') as f:
        for line in lines:
            n_delim = line.count(delim)
            matched = delim_pred(n_delim)
            if matched:
                f.write(line)

            n += int(matched)
            yield

    logger.info(f'Number of {stem} lines: {n}')


def match_header(
    orig_path: Path,
    lines: Iterable[str],
):
    ok_path = orig_path.with_stem(f'{orig_path.stem}_ok_header_ok')
    noise_path = orig_path.with_stem(f'{orig_path.stem}_ok_header_noise')

    with ok_path.open('w') as ok_file, 
        noise_path.open('w') as noise_file:

        header = next(lines)
        ok_file.write(header)

        for line in lines:
            if line == header:
                noise_file.write(line)
            else:
                ok_file.write(line)
            yield


def process(path: Path, delim: str, process_header: bool) -> None:
    logger.info(
        'File Processing Report:n'
        f'File path: {path.absolute()}n'
        f'Delimiter: {delim}'
    )

    with path.open() as orig:
        delim_max = max(get_delim_counts(orig, delim))
        logger.info(f'Delimiter count: {delim_max}')
        match_path = partial(match_output, orig_path=path, delim=delim)

        process_funs = [
            partial(
                match_path, stem='ok', delim_pred=lambda n: n == delim_max,
            ),
            partial(
                match_path, stem='noise', delim_pred=lambda n: 0 < n < delim_max,
            ),
            partial(
                match_path, stem='error', delim_pred=lambda n: n < 1,
            ),
        ]

        if process_header:
            process_funs.append(
                partial(match_header, orig_path=path)
            )

        iters = [
            fun(lines=lines)
            for fun, lines in zip(
                process_funs, tee(orig, len(process_funs))
            )
        ]

        for n_lines, _ in enumerate(zip(*iters)):
            pass  # could put a progress bar here
    logger.info(f'Original file line count: {n_lines}')


def main():
    process(
        path=Path(input('Provide file path: ')),
        delim=input('Provide the delimiter: '),
        process_header=input(
            'Do you want to analyse the header record of this file (y/n)?: '
        ).lower().startswith('y'),
    )


if __name__ == '__main__':
    main()

Leave a Reply

Your email address will not be published. Required fields are marked *