Out of memory error when reading a file with json.loads in Python

Hi, I create a file and fill it with the code below.

I have some lists which I fill and then write to the file.

Each list is written on its own line.

import json
import random
import numpy as np

train_dir = '../sq-new/data/0-train.json'
outputdata = "./data/1-train-with-50-random-embedded-neg-sample.json"

# memory-mapped entity embeddings, 50 floats per entity
vec = np.memmap('../sq-new/data/entity2vec.bin', dtype='float32', mode='r', offset=0)

out_dataset = open(outputdata, 'w', encoding='utf-8')
data_train = open(train_dir, "r")

question_train = []
subject_pos_train = []
subject_neg_train = []
subject_pos_m = []
subject_neg_m = []
answer_pos_train = []
answer_neg_train = []
answer_pos_m = []
answer_neg_m = []

for line in data_train:

    line = json.loads(line)

    true_triples = []
    false_triples = []

    for triple in line['triples']:
        if triple['ans'] == True:
            true_triples.append(triple)
        elif triple['ans'] == False:
            false_triples.append(triple)

    pair_count = 0

    if (len(true_triples) > 0) and (len(false_triples) > 0):

        # sample 100 positive/negative pairs per usable question
        while pair_count < 100:

            true_sample = random.choice(true_triples)
            false_sample = random.choice(false_triples)

            question_train.append(line['question'])
            subject_neg_train.append(false_sample['q_et'])
            subject_pos_train.append(true_sample['q_et'])
            answer_neg_train.append(false_sample['c_et'])
            answer_pos_train.append(true_sample['c_et'])

            # slice each entity's 50-dimensional embedding out of the memmap
            i = int(false_sample['cet_id'])
            answer_neg_m.append(list(vec[(i*50):((i+1)*50)]))

            i = int(true_sample['cet_id'])
            answer_pos_m.append(list(vec[(i*50):((i+1)*50)]))

            i = int(false_sample['q_et_id'])
            subject_neg_m.append(list(vec[(i*50):((i+1)*50)]))

            i = int(true_sample['q_et_id'])
            subject_pos_m.append(list(vec[(i*50):((i+1)*50)]))

            pair_count = pair_count + 1

# each list goes on its own line; the *_m lists are passed through str() first,
# so those four lines end up as JSON strings containing Python list reprs
out_dataset.write(json.dumps(question_train) + "\n")
out_dataset.write(json.dumps(subject_neg_train) + "\n")
out_dataset.write(json.dumps(subject_pos_train) + "\n")
out_dataset.write(json.dumps(answer_neg_train) + "\n")
out_dataset.write(json.dumps(answer_pos_train) + "\n")
out_dataset.write(json.dumps(str(answer_neg_m)) + "\n")
out_dataset.write(json.dumps(str(answer_pos_m)) + "\n")
out_dataset.write(json.dumps(str(subject_neg_m)) + "\n")
out_dataset.write(json.dumps(str(subject_pos_m)) + "\n")

out_dataset.close()

The resulting file is 16 GB.
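
The file has nine lines: the first five are ordinary JSON arrays, and the last four are JSON strings wrapping the str() form of the nested embedding lists (which is why they need ast.literal_eval when read back). As a toy illustration of the layout, with made-up values, only one sampled pair per list, and the 50-dimensional embeddings shortened to three numbers:

["what is the capital of x?"]
["m.02x1"]
["m.05qn"]
["m.0c7k"]
["m.09fl"]
"[[0.013, -0.208, 0.114]]"
"[[0.551, 0.002, -0.094]]"
"[[0.078, 0.345, -0.129]]"
"[[0.402, -0.017, 0.266]]"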

When I try to read the file back, my memory fills up completely and the process gets killed, even though my server has 250 GB of RAM.

Here is my code for reading. So what is wrong with my code?

import ast
import json

train_dir = '../create-data/data/1-train-with-50-random-embedded-neg-sample.json'
# train_dir = '../create-data/data/1-train-one-embeded.json'

data_train = open(train_dir, "r")

# the first five lines are plain JSON arrays
question_train = json.loads(data_train.readline())
subject_neg_train = json.loads(data_train.readline())
subject_pos_train = json.loads(data_train.readline())
answer_neg_train = json.loads(data_train.readline())
answer_pos_train = json.loads(data_train.readline())

# the last four lines are JSON strings wrapping Python list reprs,
# so they are parsed again with ast.literal_eval into nested lists
answer_neg_m = ast.literal_eval(json.loads(data_train.readline()))
answer_pos_m = ast.literal_eval(json.loads(data_train.readline()))
subject_neg_m = ast.literal_eval(json.loads(data_train.readline()))
subject_pos_m = ast.literal_eval(json.loads(data_train.readline()))

print(type(subject_neg_train))
print(len(subject_neg_train))


print(type(subject_pos_m))
print(len(subject_pos_m))
print(type(subject_pos_m[0]))



Solution 1:[1]

Try pandas:

import pandas as pd
df = pd.read_json(train_dir)
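
Note that read_json with default arguments expects a single JSON document, so it may reject this file, where each of the nine lines is a separate document. Independently of pandas, most of the memory here goes into the four embedding matrices: json.loads returns a multi-gigabyte string, ast.literal_eval builds a full parse tree for it, and the result is a nested Python list in which every float is a separate ~24-byte object plus an 8-byte list slot, versus 4 bytes in a float32 array. A minimal sketch of an alternative reader (a variant of the question's code, not part of the original answer; the nine-line layout is assumed) that converts each matrix to a compact NumPy array as soon as it is parsed:

import ast
import json
import numpy as np

train_dir = '../create-data/data/1-train-with-50-random-embedded-neg-sample.json'

def read_matrix(f):
    # Each embedding line is a JSON string wrapping a Python list repr
    # (because of the str() call when writing), hence the two parse steps.
    # Converting straight to float32 keeps the long-lived copy compact.
    return np.array(ast.literal_eval(json.loads(f.readline())), dtype=np.float32)

with open(train_dir, 'r', encoding='utf-8') as f:
    question_train = json.loads(f.readline())
    subject_neg_train = json.loads(f.readline())
    subject_pos_train = json.loads(f.readline())
    answer_neg_train = json.loads(f.readline())
    answer_pos_train = json.loads(f.readline())
    answer_neg_m = read_matrix(f)
    answer_pos_m = read_matrix(f)
    subject_neg_m = read_matrix(f)
    subject_pos_m = read_matrix(f)

print(subject_pos_m.shape, subject_pos_m.dtype)

Each matrix still passes through ast.literal_eval once, so this lowers the peak rather than removing it; the cleaner fix is to skip the JSON/str round-trip for the embeddings when creating the data, for example saving them with np.save and loading them back with np.load(..., mmap_mode='r').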

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Mehran Rahmanzadeh