Out of memory error when reading file with json.loads in Python
Hi, I create a file and fill it with the code below.
I have some lists which I fill and then write to the file.
Each list is written on its own line.
import json
import random
import numpy as np

train_dir = '../sq-new/data/0-train.json'
outputdata = "./data/1-train-with-50-random-embedded-neg-sample.json"

# entity embeddings, memory-mapped from disk; 50 floats per entity
vec = np.memmap('../sq-new/data/entity2vec.bin', dtype='float32', mode='r', offset=0)

out_dataset = open(outputdata, 'w', encoding='utf-8')
data_train = open(train_dir, "r")

question_train = []
subject_pos_train = []
subject_neg_train = []
subject_pos_m = []
subject_neg_m = []
answer_pos_train = []
answer_neg_train = []
answer_pos_m = []
answer_neg_m = []

for line in data_train:
    line = json.loads(line)
    true_triples = []
    false_triples = []
    for triple in line['triples']:
        if triple['ans'] == True:
            true_triples.append(triple)
        elif triple['ans'] == False:
            false_triples.append(triple)
    pair_count = 0
    if (len(true_triples) > 0) and (len(false_triples) > 0):
        # sample 100 positive/negative pairs per question
        while pair_count < 100:
            true_sample = random.choice(true_triples)
            false_sample = random.choice(false_triples)
            question_train.append(line['question'])
            subject_neg_train.append(false_sample['q_et'])
            subject_pos_train.append(true_sample['q_et'])
            answer_neg_train.append(false_sample['c_et'])
            answer_pos_train.append(true_sample['c_et'])
            # append the 50-dimensional embedding slice for each entity id
            i = int(false_sample['cet_id'])
            answer_neg_m.append(list(vec[(i*50):((i+1)*50)]))
            i = int(true_sample['cet_id'])
            answer_pos_m.append(list(vec[(i*50):((i+1)*50)]))
            i = int(false_sample['q_et_id'])
            subject_neg_m.append(list(vec[(i*50):((i+1)*50)]))
            i = int(true_sample['q_et_id'])
            subject_pos_m.append(list(vec[(i*50):((i+1)*50)]))
            pair_count = pair_count + 1

# each accumulated list is dumped as one (very long) JSON line;
# the embedding lists are first converted to their str() representation
out_dataset.write(json.dumps(question_train) + "\n")
out_dataset.write(json.dumps(subject_neg_train) + "\n")
out_dataset.write(json.dumps(subject_pos_train) + "\n")
out_dataset.write(json.dumps(answer_neg_train) + "\n")
out_dataset.write(json.dumps(answer_pos_train) + "\n")
out_dataset.write(json.dumps(str(answer_neg_m)) + "\n")
out_dataset.write(json.dumps(str(answer_pos_m)) + "\n")
out_dataset.write(json.dumps(str(subject_neg_m)) + "\n")
out_dataset.write(json.dumps(str(subject_pos_m)) + "\n")
out_dataset.close()
The resulting file is 16 GB.
When I try to read the file, memory fills up completely and the process gets killed, even though my server has 250 GB of RAM.
Here is my code for reading.
So what is wrong with my code?
import ast
import json

train_dir = '../create-data/data/1-train-with-50-random-embedded-neg-sample.json'
# train_dir = '../create-data/data/1-train-one-embeded.json'

data_train = open(train_dir, "r")

question_train = json.loads(data_train.readline())
subject_neg_train = json.loads(data_train.readline())
subject_pos_train = json.loads(data_train.readline())
answer_neg_train = json.loads(data_train.readline())
answer_pos_train = json.loads(data_train.readline())

# the embedding lines were written as json.dumps(str(...)), so json.loads()
# returns a string, which is then parsed back into nested lists with ast.literal_eval()
answer_neg_m = ast.literal_eval(json.loads(data_train.readline()))
answer_pos_m = ast.literal_eval(json.loads(data_train.readline()))
subject_neg_m = ast.literal_eval(json.loads(data_train.readline()))
subject_pos_m = ast.literal_eval(json.loads(data_train.readline()))

print(type(subject_neg_train))
print(len(subject_neg_train))
print(type(subject_pos_m))
print(len(subject_pos_m))
print(type(subject_pos_m[0]))
Solution 1:[1]
Try pandas
import pandas as pd
df = pd.read_json(train_dir)
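If the dataset can be rewritten so that each training example is its own JSON object on its own line (JSON Lines), pandas can also read it in chunks instead of all at once, which keeps peak memory bounded. Below is a minimal sketch, assuming a hypothetical JSON Lines file at train_dir and a chunk size of 10,000 rows:

import pandas as pd

train_dir = '../create-data/data/1-train.jsonl'  # hypothetical JSON Lines file, one record per line

# with chunksize set (and lines=True), read_json returns an iterator of
# DataFrames instead of loading the entire file into memory at once
for chunk in pd.read_json(train_dir, lines=True, chunksize=10000):
    print(len(chunk))  # replace with the real per-chunk processing

Only one chunk is held in memory at a time, so peak usage is governed by the chunk size rather than the total file size.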
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Mehran Rahmanzadeh |