'Clean file line by line and write lines into a new file

I am new to Python and I want to clean a big file line by line and write clean lines into a new file. I think I have been able to read the file but for some reason, when I try to write the file, only the last line is written in the new file.

I do not really understand why that happens and I do not know if somebody could help me. The idea is to have the clean corpus in a new file. Here is my code:

import re
import nltk

##### BASQUE CORPUS CLEANING ONLY CONSISTS OF ONE STEP AS WE DO NOT SPEAK BASQUE #####

### DEFINITIONS
### Characters used in Basque: letters, numbers and orthographic marks
basque_characters = "A B C D E F G H I J K L M N Ñ O P Q R S T U V W X Y Z a b c d e f g h i j k l m n ñ o p q r s t u v w x y z 0 1 2 3 4 5 6 7 8 9 - — _ \ / \" ( ) [ ] { } * « » . , ; : ' ° º ª ² ³ ! ? & ¶ § € £ ¥ $ ¢ % = > < | @ # + → ʼ"
basque_characters_list = nltk.word_tokenize(basque_characters)
### Regex removers definition
tags_remover = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});') # Remove html tags
squarebrackets_remover = re.compile('\[.*?\]|\[\d*?\]') # Remove square brackets and their content
curlybrackets_remover = re.compile('\{.*?\}|\{\d*?\}') # Remove curly brackets and their content
brackets_remover = re.compile('\(.*?\)|\(\d*?\)') # Remove brackets and their content

##### RAW TEXT
### Read a file (string) and convert its content into a list of word tokens
with open(r"corpus.txt", 'r', encoding='utf-8') as raw_text:
    for line in raw_text:

### Find any lowercased Roman number from 1 to 3999 in the text and put it uppercase
        roman_lowercase = re.findall(r'(?=\b[mdclxvi]+\b)m{0,4}(?:cm|cd|d?c{0,3})(?:xc|xl|l?x{0,3})(?:ix|iv|v?i{0,3})', line) 
        line = re.sub(r'(?=\b[mdclxvi]+\b)m{0,4}(?:cm|cd|d?c{0,3})(?:xc|xl|l?x{0,3})(?:ix|iv|v?i{0,3})', lambda match: r'{}'.format(match.group().upper()), line)

##### TEXT CLEANING
# Convert file's content into a list of word tokens
        line_list = nltk.word_tokenize(line)

### Remove words containing non-Basque characters by doing matches between both lists and reconvert into a string
        clean_line_list = [item for item in line_list if any(x in item for x in basque_characters_list)] # Removal
        clean_line = ' '.join(clean_line_list) # String reconversion

### Use predefined regex removers to get rid of certain elements causing problems
        clean_line = re.sub(tags_remover, ' ', clean_line) # HTML tags
        clean_line = re.sub(squarebrackets_remover, ' ', clean_line) # Square brackets
        clean_line = re.sub(curlybrackets_remover, ' ', clean_line) # Curly brackets
        clean_line = re.sub(brackets_remover, ' ', clean_line) # Brackets

### Repair damaged format from (re)conversion and adapt some characters
        if '  ' or '¿ ' or ' ?' or '¡ ' or ' !'  or  ' .' or ' ,'or ' ;' or ' :' or ' @ ' or '# ' or ' ʼ ' or '« ' or ' »' or '"' or " '' " or ' `` ' in clean_line:
# Remove unnecessary spaces
            clean_line = clean_line.replace('  ','')
            clean_line = clean_line.replace('¿ ','¿')
            clean_line = clean_line.replace('¡ ','¡')
            clean_line = clean_line.replace(' ?','?')
            clean_line = clean_line.replace(' !','!')
            clean_line = clean_line.replace(' .','.')
            clean_line = clean_line.replace(' ,',',')
            clean_line = clean_line.replace(' ;',';')
            clean_line = clean_line.replace(' :',':')
            clean_line = clean_line.replace('# ','#')
            clean_line = clean_line.replace(' @ ','@')
            clean_line = clean_line.replace(' ʼ ','ʼ')
# Convert all sorts of quotes into single quotes understood by AD
            clean_line = clean_line.replace('« ',"'") 
            clean_line = clean_line.replace(' »',"'")
            clean_line = clean_line.replace('"',"'") 
            clean_line = clean_line.replace(" '' ","' ")
            clean_line = clean_line.replace(' `` '," '")
            clean_line = clean_line.replace('\xad','') # Remove soft hyphens. Other ways are '\u00ad' and '\N{SOFT HYPHEN}'

### Remove sentences containing certain problematic or infrequent Basque characters
        chopped_text = nltk.sent_tokenize(clean_line)
        for sentence in chopped_text:
            chopped_text = [sentence for sentence in chopped_text if ")" not in sentence] # Remaining closing parentheses
            chopped_text = [sentence for sentence in chopped_text if "&" not in sentence] # Por quitar los &amp que salían
            chopped_text = [sentence for sentence in chopped_text if "•" not in sentence] # Por quitar los bullet points pegados a palabras
            chopped_text = [sentence for sentence in chopped_text if "—" not in sentence] # ISO 8895-15 inexistent character: em dash 
            chopped_text = [sentence for sentence in chopped_text if "ʼ" not in sentence] # ISO 8895-15 inexistent character : apostroph
            chopped_text = [sentence for sentence in chopped_text if "¶" not in sentence] # AD problematic character: pilcrow
            chopped_text = [sentence for sentence in chopped_text if "§" not in sentence] # AD problematic character: section sign
            chopped_text = [sentence for sentence in chopped_text if "¢" not in sentence] # AD problematic character: cent sign
            chopped_text = [sentence for sentence in chopped_text if "#" not in sentence] # Not listed by RAE's Orthography: hash
            chopped_text = [sentence for sentence in chopped_text if "→" not in sentence] # ISO 8895-15 inexistent character: arrow
            chopped_text = [sentence for sentence in chopped_text if "_" not in sentence] # AD insufficient transcription: underscore
            chopped_text = [sentence for sentence in chopped_text if "/" not in sentence] # AD insufficient transcription: bar
            chopped_text = [sentence for sentence in chopped_text if "*" not in sentence] # AD erroneous transcription: asterisk
            chopped_text = [sentence for sentence in chopped_text if "<" not in sentence] # AD insufficient transcription: smaller
            chopped_text = [sentence for sentence in chopped_text if ">" not in sentence] # AD insufficient transcription: bigger
            chopped_text = [sentence for sentence in chopped_text if "|" not in sentence] # AD erroneous transcription: vertical bar
            chopped_text = [sentence for sentence in chopped_text if "@" not in sentence] # Not listed by RAE's Orthography: at

### WRITE NEW CLEAN FILE
with open(r"C:\Users\Usuario\Desktop\02-limpio.txt", 'w+', encoding='utf-8') as final_text:
    final_text.write('\n'.join(chopped_text))
final_text.close()

Any help would be really much appreciated.



Solution 1:[1]

Primary Issue

Your write statement was not inside the loop. It is inside the last with block (which was also not inside the loop).

    final_text.write('\n'.join(chopped_text))`

Thus it has written only the last line, i.e. chopped_text, to the file. Pay attention to indentation.

Clean way to fix it

import re
import nltk

### define functions in order of usage

### WRITE text to NEW CLEAN FILE
def write(text, path):
    with open(path, 'w+', encoding='utf-8') as out:
       final_text.write(text)

path_in = r"..."
path_out = r"C:\Users\Usuario\Desktop\02-limpio.txt"

# Read and transform the text
text = read_file(path_in)  # TODO: define the function
cleaned = clean(text)  # TODO: define the function
chopped = chop(cleaned)  # TODO: define the function like in example below

# Write the transformed text
write(chopped, path_out)

Note:

  • Make sure the input is a text (string). Thus join any lines before with '\n'.join(lines) (if lines is a list).
  • with cares for closing the file, so you don't need to do it explicitly with final_text.close().

Problem solving strategy

Try the problem solving strategy divide and conquer. This could be by splitting your implementation into functions which are easy to test and debug in isolation.

For example:

Function chop to remove or exclude sentences

import sys
import re
import nltk

### Remove sentences containing certain problematic or infrequent Basque characters
suspicious_chars = [ ")" # Remaining closing parentheses
, "&" # Por quitar los &amp que salían
, "•" # Por quitar los bullet points pegados a palabras
, "—" # ISO 8895-15 inexistent character: em dash 
, "?" # ISO 8895-15 inexistent character : apostroph
, "¶" # AD problematic character: pilcrow
, "§" # AD problematic character: section sign
, "¢" # AD problematic character: cent sign
, "#" # Not listed by RAE's Orthography: hash
, "?" # ISO 8895-15 inexistent character: arrow
, "_" # AD insufficient transcription: underscore
, "/" # AD insufficient transcription: bar
, "*" # AD erroneous transcription: asterisk
, "<" # AD insufficient transcription: smaller
, ">" # AD insufficient transcription: bigger
, "|" # AD erroneous transcription: vertical bar
, "@" # Not listed by RAE's Orthography: at
]
suspicious_pattern = f"[{''.join(suspicious_chars)}]"
suspicious_regex = re.compile(suspicious_pattern)

nltk.download('punkt')

def chop(text):
    sentences = nltk.sent_tokenize(text)
    filtered = [s for s in sentences if not suspicious_regex.match(s)]
    chopped = '\n'.join(filtered)
    return chopped

if __name__ == '__main__':
    if len(sys.argv) != 2:
       print('required file argument missing')
       exit(1)
    with open(sys.argv[1], mode='r', encoding='utf-8') as f:
        text = f.read()
        print("lines read", len(text.splitlines()))
        chopped = chop(text)
        print("lines chopped", len(chopped.splitlines()))

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 hc_dev