Problem
I have created simple code in which it takes flat file (txt) and eliminate the noise and error and only return clean rows, i.e. return flat file with the data I am interested in. it works fine but since I am a beginner I am sure my code is not efficient. I would appreciate if you can guide me into making it more professional and efficient – please be as brutal as you want, it will all add to my learning and development.
import pandas as pd
import numpy as np
def openFile(path, type):
file = open(path,type)
return file
def getFileDelim(f_path, f_del):
file = openFile(f_path, 'r')
lines = file.readlines()
cnt_list = np.array([0])
cnt_list = np.delete(cnt_list,0)
for i in range(100):
cnt_list = np.append(cnt_list,lines[i].count(f_del))
return np.bincount(cnt_list).argmax()
def getHeaders(f_path):
file_o = openFile(f_path.replace('.txt', '_ok.txt'), 'r')
file_o_ho = openFile(f_path.replace('.txt', '_ok_header_ok.txt'), 'w')
file_o_hn = openFile(f_path.replace('.txt', '_ok_header_noise.txt'), 'w')
lines = file_o.readlines()
cnt = 0
for line in lines:
if cnt == 0:
header = line
file_o_ho.write(line)
else:
if line != header:
file_o_ho.write(line)
else:
file_o_hn.write(line)
cnt += 1
def fileProcessing(f_path, f_del, del_cnt):
file = openFile(f_path, 'r')
file_o = openFile(f_path.replace('.txt', '_ok.txt'), 'w')
file_n = openFile(f_path.replace('.txt', '_noise.txt'), 'w')
file_e = openFile(f_path.replace('.txt', '_error.txt'), 'w')
lines = file.readlines()
for line in lines:
if line.count("|") == del_cnt:
file_o.write(line)
elif line.count("|") > 0:
file_n.write(line)
elif line.count("|") == 0:
file_e.write(line)
header_process = input('Do you want to analyse the header record of this file (y/n)?:')
if header_process.lower() == 'y' or header_process.lower() == 'yes':
getHeaders(f_path)
def generateReport(f_path, f_del, del_count):
output = "File Processing report:n"
output += "............................................................................................n"
output += "File Path: {}n".format(f_path)
output += "Delimeter: {}n".format(f_del)
output += "Delimeter Count: {}n".format(del_count)
output += "n"
file = openFile(f_path, 'r')
file_o = openFile(f_path.replace('.txt', '_ok.txt'), 'r')
file_n = openFile(f_path.replace('.txt', '_noise.txt'), 'r')
file_e = openFile(f_path.replace('.txt', '_error.txt'), 'r')
f_cnt = len(file.readlines())
f_o_cnt = len(file_o.readlines())
f_e_cnt = len(file_n.readlines())
f_n_cnt = len(file_e.readlines())
output += "Original File Line Count: {}n".format(f_cnt)
output += "Number of ok lines: {}n".format(f_o_cnt)
output += "Number of noise lines: {}n".format(f_n_cnt)
output += "Number of error lines: {}n".format(f_e_cnt)
output += "Total number of lines assessed: {}n".format(f_o_cnt + f_n_cnt + f_e_cnt)
file_r = openFile(f_path.replace('.txt', '_report.txt'), 'w')
file_r.write(output)
return output
def main (f_path,f_del,del_cnt = None):
if del_cnt == None:
del_cnt = getFileDelim(f_path, f_del)
fileProcessing(f_path, f_del, del_cnt)
print(generateReport(f_path, f_del, del_cnt))
f_path = input("Please provide file path:").replace('"','')
f_del = input("Provide the delimeter:")
main(f_path,f_del)
The raw file i will be processing is like below picture
15.04.2021 Lieferungen mit Auftragsdaten 1
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Lieferungen mit Auftragsdaten
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|Lieferung | Pos|Angel.am |VStl|LFArt|Werk|Warenempf.|IstWA Dat |Material |Materialnummer |Liefermenge|ME |MS|Auftragsart|Aufragsdatum|Auftragsmenge|ME |VB|Werk|Lieferung |Verkaufsb.|
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|9888477351|000010|24.01.2020|LC01|ZLSO |0156|V00000DEB7|24.01.2020|01.93105-9780|RO M3313-C-E235+N-M3574-B21 20X2 HL 6000 | 18 |M |35|ZSO1 |20.01.2020 | 18 |M |01|0156|9888477351|214898993 |
|9888401282|000010|11.01.2020|LC03|ZLSO |0156|V00000CH01|13.01.2020|04.33335-9950|BODENBELAG PVC-MAN323-M-2000-F5251A-GAYA MOSAIC NT4482 BABEL RO24M | 24 |M |35|ZSO1 |20.12.2019 | 24 |M |01|0156|9888401282|214805942 |
|9888437256|000070|17.01.2020|LC01|ZLTA |0156|V00000DE33|20.01.2020|04.38235-9315|SCHWEISSCHNUR PVC FRAN RO100M DUNKELGRAU | 40 |M |35|ZTA1 |16.01.2020 | 40 |M |02|0156|9888437256|214888602 |
|9888363103|000010|06.01.2020|LC01|ZLSO |0156|V00000DE78|06.01.2020|04.38235-9315|SCHWEISSCHNUR PVC FRAN RO100M DUNKELGRAU | 20 |M |35|ZSO1 |06.01.2020 | 20 |M |01|0156|9888363103|214834613 |
|9888411482|000010|14.01.2020|LC03|ZLSO |0156|V008005037|14.01.2020|33.02640-0034|HALTER | 1 |ST |35|ZSO1 |18.12.2019 | 1 |ST |01|0156|9888411482|214795849 |
|9888470166|000010|23.01.2020|LC01|ZLTA |0156|V00000NO00|23.01.2020|33.25140-0010|HALTER RE | 1 |ST |35|ZTA1 |25.11.2019 | 1 |ST |02|0156|9888470166|214696718 |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
15.04.2021 Lieferungen mit Auftragsdaten 2
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|Lieferung | Pos|Angel.am |VStl|LFArt|Werk|Warenempf.|IstWA Dat |Material |Materialnummer |Liefermenge|ME |MS|Auftragsart|Aufragsdatum|Auftragsmenge|ME |VB|Werk|Lieferung |Verkaufsb.|
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|9888502930|000010|29.01.2020|LC01|ZLSO |0156|V000050401|29.01.2020|81.25456-6921|ZSB KABELSTRANG FHS ZUSATZHEIZUNG D5W OHNE TRS | 1 |ST |35|ZSO1 |16.12.2019 | 1 |ST |01|0156|9888502930|214785306 |
|9888503710|000010|29.01.2020|LC01|ZLTA |0156|V00000AT29|29.01.2020|81.25456-6982|ZSB KABELSTRANG RAHMEN LINKS BEI EBS-5 | 1 |ST |35|ZTA1 |11.12.2019 | 1 |ST |02|0156|9888503710|214764484 |
|9888453594|000010|21.01.2020|LC03|ZLSO |0156|V00000GB08|21.01.2020|81.25459-5207|ZSB KABELSTRANG SCHEINWERFER LI RANGIERLEUCHTE | 1 |ST |35|ZSO1 |12.12.2019 | 1 |ST |01|0156|9888453594|214774049 |
|9888477288|000010|24.01.2020|LC03|ZLSO |0156|V00000GB17|24.01.2020|81.25459-7711|ZSB KABELSTRANG FHS WINTERDIENSTBELEUCHTUNG | 1 |ST |35|ZSO1 |05.12.2019 | 1 |ST |01|0156|9888477288|214741051 |
|9888485462|000010|25.01.2020|LC03|ZLTA |0156|V00000GB17|27.01.2020|81.25459-7711|ZSB KABELSTRANG FHS WINTERDIENSTBELEUCHTUNG | 1 |ST |35|ZTA1 |20.12.2019 | 1 |ST |02|0156|9888485462|214805832 |
Solution
Soooo. First, this has nothing to do with Pandas (you included the library but then didn’t use it); and your use of Numpy is dubious. I’m going to suggest that you skip Numpy altogether.
A brief laundry list of things that should change:
- Use lower_snake_case for function and variable names
- Consider adding PEP484 type hints
- Even if you did need numpy, which you don’t, it doesn’t make sense to populate and then delete an array – just call
np.empty((0,))
for i in range(100)
will crash for any file less than that many lines- You don’t actually need to successive-append to an entire numpy array – if you did need to successive-append, Python lists are often faster for that; but you don’t need either. You can just apply a running
max
. .replace('.txt'
has a very funny bug. Since you assume that all files end in.txt
, any file that is passed in with a different extension will be trampled due to a skipped replacement and subsequent overwrite. Usepathlib
and stems instead.- Generally having to check for an index on the inside of a loop, as in
getHeaders
, is a code smell; and that’s true here. Easier to represent the file as an iterator and do a singlenext()
to get the first line. - You’ve asked for a separator and then mostly ignored it and assumed a pipe instead.
- An easier way to tee output to stdout and a file is to use real logging.
- Use
is None
instead of== None
- Use a main guard
The following is one way of accomplishing the above, and enforcing that the file only needs to be opened once, only needs to be traversed once, and does not need to be kept in memory all at the same time. If the file is small enough (say 1GB or less), it would probably be faster to use a different algorithm that loads all lines into memory, which this does not do. Anyway:
Suggested
from functools import partial
from itertools import islice, tee
from logging import getLogger, StreamHandler, FileHandler, INFO
from pathlib import Path
from typing import Iterable, TextIO, Callable
def setup_logger():
logger = getLogger('clean')
logger.addHandler(StreamHandler())
logger.addHandler(FileHandler(filename='report.txt'))
logger.setLevel(INFO)
return logger
logger = setup_logger()
def get_delim_counts(file: TextIO, delim: str, n_lines: int = 100) -> Iterable[int]:
for line in islice(file, n_lines):
yield line.count(delim)
file.seek(0)
def match_output(
orig_path: Path,
lines: Iterable[str],
stem: str,
delim: str,
delim_pred: Callable[[int], bool],
) -> Iterable:
n = 0
path = orig_path.with_stem(f'{orig_path.stem}_{stem}')
with path.open('w') as f:
for line in lines:
n_delim = line.count(delim)
matched = delim_pred(n_delim)
if matched:
f.write(line)
n += int(matched)
yield
logger.info(f'Number of {stem} lines: {n}')
def match_header(
orig_path: Path,
lines: Iterable[str],
):
ok_path = orig_path.with_stem(f'{orig_path.stem}_ok_header_ok')
noise_path = orig_path.with_stem(f'{orig_path.stem}_ok_header_noise')
with ok_path.open('w') as ok_file,
noise_path.open('w') as noise_file:
header = next(lines)
ok_file.write(header)
for line in lines:
if line == header:
noise_file.write(line)
else:
ok_file.write(line)
yield
def process(path: Path, delim: str, process_header: bool) -> None:
logger.info(
'File Processing Report:n'
f'File path: {path.absolute()}n'
f'Delimiter: {delim}'
)
with path.open() as orig:
delim_max = max(get_delim_counts(orig, delim))
logger.info(f'Delimiter count: {delim_max}')
match_path = partial(match_output, orig_path=path, delim=delim)
process_funs = [
partial(
match_path, stem='ok', delim_pred=lambda n: n == delim_max,
),
partial(
match_path, stem='noise', delim_pred=lambda n: 0 < n < delim_max,
),
partial(
match_path, stem='error', delim_pred=lambda n: n < 1,
),
]
if process_header:
process_funs.append(
partial(match_header, orig_path=path)
)
iters = [
fun(lines=lines)
for fun, lines in zip(
process_funs, tee(orig, len(process_funs))
)
]
for n_lines, _ in enumerate(zip(*iters)):
pass # could put a progress bar here
logger.info(f'Original file line count: {n_lines}')
def main():
process(
path=Path(input('Provide file path: ')),
delim=input('Provide the delimiter: '),
process_header=input(
'Do you want to analyse the header record of this file (y/n)?: '
).lower().startswith('y'),
)
if __name__ == '__main__':
main()