Speed up iteration for large dataset in python
$begingroup$
I'm trying to parse data into a csv prior to a bulk transaction into Neo4j. I'm using vast amounts of data in the relationships and wondered if anyone could help in speeding up the transactions below. It's genomic data, with 2,000 samples, each with up to 3.5m variants per chromosome probably equating to about 40m variants, up to two rows per sample, so ~120,000,000,000 rows in total. Currently it's taking me about 20 minutes per sample. Does anyone have any suggestions on how to improve this:
import sys
import time
import datetime
import numpy as np
import allel
import zarr
import numcodecs
import os
import pandas
import csv
import math
import dask.array as da
vcf_directory = '/media/user/Seagate Backup Plus Drive/uk_alspac/'
zarr_path = vcf_directory + 'chroms.zarr'
callset = zarr.open_group(zarr_path, mode='r')
samples_fn = '/media/user/Seagate Backup Plus Drive/uk_alspac/phenotype_data/EGAZ00001016605_UK10K_ALSPAC_Phenotype_Data_August_2013_1867samples.txt'
panel = pandas.DataFrame.from_csv(samples_fn, sep='t')
chrom_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,'X']
i = 0
j = 0
seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'
with open('/media/user/Seagate Backup Plus Drive/uk_alspac/Import_files/sample_variants.csv', 'w') as csvfile:
filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
filewriter.writerow([':START_ID(Sample)','type', 'hapA', 'hapB','genotype', 'seqtech', 'read_depth', 'phase_set','GP0','GP1','GP2','PL0','PL1','PL2', ':END_ID(Variant)'])
for chrom in chrom_list:
print(chrom)
datetime_object = datetime.datetime.now()
print(datetime_object)
sampledata = callset[chrom]['samples']
samples = list(sampledata)
variants = allel.VariantChunkedTable(callset[chrom]['variants'], names=['AC','AF_AFR', 'AF_AMR', 'AF_ASN', 'AF_EUR', 'AF_MAX', 'CGT', 'CLR', 'CSQ', 'DP', 'DP4', 'ESP_MAF', 'FILTER_LowQual', 'FILTER_MinHWE', 'FILTER_MinVQSLOD', 'FILTER_PASS', 'HWE', 'ICF', 'ID', 'IS', 'PC2', 'PCHI2', 'POS', 'PR', 'QCHI2', 'QUAL', 'REF', 'ALT', 'INDEL', 'SHAPEIT', 'SNP_ID', 'TYPE', 'UGT', 'VQSLOD', 'dbSNPmismatch', 'is_snp', 'numalt', 'svlen'], index='POS')
pos = variants['POS'][:]
SNPid = variants['ID'][:]
ref = variants['REF'][:]
alt = variants['ALT'][:]
dp = variants['DP'][:]
ac = variants['AC'][:]
vartype = variants['TYPE'][:]
svlen = variants['svlen'][:]
qual = variants['QUAL'][:]
vq = variants['VQSLOD'][:]
numalt = variants['numalt'][:]
csq = variants['CSQ'][:]
vcfv = 'VCFv4.1'
refv = 'https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/'
calldata = callset[chrom]['calldata']
dpz = calldata['DP']
psz = calldata['PS']
plz = calldata['PL']
gpz = calldata['GP']
gtz = calldata['GT']
i = 0
j = 0
seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'
for i in range(50):
print("Chrom " + str(chrom) + "sample" + str(i))
print(datetime_object)
gt = gtz[:,i].tolist()
hap1, hap2 = zip(*gt)
dp = dpz[:, i]
ps = psz[:, i]
pl = plz[:, i]
gp = gpz[:, i]
subject = samples[i]
for j in range(len(pos)):
h1 = int(hap1[j])
h2 = int(hap2[j])
read_depth = int(dp[j])
ps1 = int(ps[j])
PL0 = int(pl[j][0])
PL1 = int(pl[j][1])
PL2 = int(pl[j][2])
GP0 = float(gp[j][0])
GP1 = float(gp[j][1])
GP2 = float(gp[j][2])
if h1 == 0 and h2 == 0:
filewriter.writerow([subject,"Homozygous",h1 ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
elif h1 == 0 and h2 > 0:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1 ,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])
elif h1 > 0 and h2 == 0:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1, '', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'' ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
elif h1 == h2 and h1 > 0:
filewriter.writerow([subject,"Homozygous",h1, h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
else:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])
It was all done a bit ad-hoc and I'm still new to python, so any tips would be appreciated.
python performance csv iteration
New contributor
$endgroup$
add a comment |
$begingroup$
I'm trying to parse data into a csv prior to a bulk transaction into Neo4j. I'm using vast amounts of data in the relationships and wondered if anyone could help in speeding up the transactions below. It's genomic data, with 2,000 samples, each with up to 3.5m variants per chromosome probably equating to about 40m variants, up to two rows per sample, so ~120,000,000,000 rows in total. Currently it's taking me about 20 minutes per sample. Does anyone have any suggestions on how to improve this:
import sys
import time
import datetime
import numpy as np
import allel
import zarr
import numcodecs
import os
import pandas
import csv
import math
import dask.array as da
vcf_directory = '/media/user/Seagate Backup Plus Drive/uk_alspac/'
zarr_path = vcf_directory + 'chroms.zarr'
callset = zarr.open_group(zarr_path, mode='r')
samples_fn = '/media/user/Seagate Backup Plus Drive/uk_alspac/phenotype_data/EGAZ00001016605_UK10K_ALSPAC_Phenotype_Data_August_2013_1867samples.txt'
panel = pandas.DataFrame.from_csv(samples_fn, sep='t')
chrom_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,'X']
i = 0
j = 0
seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'
with open('/media/user/Seagate Backup Plus Drive/uk_alspac/Import_files/sample_variants.csv', 'w') as csvfile:
filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
filewriter.writerow([':START_ID(Sample)','type', 'hapA', 'hapB','genotype', 'seqtech', 'read_depth', 'phase_set','GP0','GP1','GP2','PL0','PL1','PL2', ':END_ID(Variant)'])
for chrom in chrom_list:
print(chrom)
datetime_object = datetime.datetime.now()
print(datetime_object)
sampledata = callset[chrom]['samples']
samples = list(sampledata)
variants = allel.VariantChunkedTable(callset[chrom]['variants'], names=['AC','AF_AFR', 'AF_AMR', 'AF_ASN', 'AF_EUR', 'AF_MAX', 'CGT', 'CLR', 'CSQ', 'DP', 'DP4', 'ESP_MAF', 'FILTER_LowQual', 'FILTER_MinHWE', 'FILTER_MinVQSLOD', 'FILTER_PASS', 'HWE', 'ICF', 'ID', 'IS', 'PC2', 'PCHI2', 'POS', 'PR', 'QCHI2', 'QUAL', 'REF', 'ALT', 'INDEL', 'SHAPEIT', 'SNP_ID', 'TYPE', 'UGT', 'VQSLOD', 'dbSNPmismatch', 'is_snp', 'numalt', 'svlen'], index='POS')
pos = variants['POS'][:]
SNPid = variants['ID'][:]
ref = variants['REF'][:]
alt = variants['ALT'][:]
dp = variants['DP'][:]
ac = variants['AC'][:]
vartype = variants['TYPE'][:]
svlen = variants['svlen'][:]
qual = variants['QUAL'][:]
vq = variants['VQSLOD'][:]
numalt = variants['numalt'][:]
csq = variants['CSQ'][:]
vcfv = 'VCFv4.1'
refv = 'https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/'
calldata = callset[chrom]['calldata']
dpz = calldata['DP']
psz = calldata['PS']
plz = calldata['PL']
gpz = calldata['GP']
gtz = calldata['GT']
i = 0
j = 0
seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'
for i in range(50):
print("Chrom " + str(chrom) + "sample" + str(i))
print(datetime_object)
gt = gtz[:,i].tolist()
hap1, hap2 = zip(*gt)
dp = dpz[:, i]
ps = psz[:, i]
pl = plz[:, i]
gp = gpz[:, i]
subject = samples[i]
for j in range(len(pos)):
h1 = int(hap1[j])
h2 = int(hap2[j])
read_depth = int(dp[j])
ps1 = int(ps[j])
PL0 = int(pl[j][0])
PL1 = int(pl[j][1])
PL2 = int(pl[j][2])
GP0 = float(gp[j][0])
GP1 = float(gp[j][1])
GP2 = float(gp[j][2])
if h1 == 0 and h2 == 0:
filewriter.writerow([subject,"Homozygous",h1 ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
elif h1 == 0 and h2 > 0:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1 ,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])
elif h1 > 0 and h2 == 0:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1, '', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'' ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
elif h1 == h2 and h1 > 0:
filewriter.writerow([subject,"Homozygous",h1, h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
else:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])
It was all done a bit ad-hoc and I'm still new to python, so any tips would be appreciated.
python performance csv iteration
New contributor
$endgroup$
$begingroup$
Thanks @Ludisposed. Can you tell me what difference it will be making here? The results seem to come out as anticipated and I'm hesitant to stop the loop as I'll have to start over again.
$endgroup$
– Dave C
6 hours ago
$begingroup$
Ah yes, sorry @Ludisposed that was just a formatting error when writing the question. I've corrected it now. It's running at the moment, I just wondered if there was any way to speed up the loop.
$endgroup$
– Dave C
6 hours ago
add a comment |
$begingroup$
I'm trying to parse data into a csv prior to a bulk transaction into Neo4j. I'm using vast amounts of data in the relationships and wondered if anyone could help in speeding up the transactions below. It's genomic data, with 2,000 samples, each with up to 3.5m variants per chromosome probably equating to about 40m variants, up to two rows per sample, so ~120,000,000,000 rows in total. Currently it's taking me about 20 minutes per sample. Does anyone have any suggestions on how to improve this:
import sys
import time
import datetime
import numpy as np
import allel
import zarr
import numcodecs
import os
import pandas
import csv
import math
import dask.array as da
vcf_directory = '/media/user/Seagate Backup Plus Drive/uk_alspac/'
zarr_path = vcf_directory + 'chroms.zarr'
callset = zarr.open_group(zarr_path, mode='r')
samples_fn = '/media/user/Seagate Backup Plus Drive/uk_alspac/phenotype_data/EGAZ00001016605_UK10K_ALSPAC_Phenotype_Data_August_2013_1867samples.txt'
panel = pandas.DataFrame.from_csv(samples_fn, sep='t')
chrom_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,'X']
i = 0
j = 0
seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'
with open('/media/user/Seagate Backup Plus Drive/uk_alspac/Import_files/sample_variants.csv', 'w') as csvfile:
filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
filewriter.writerow([':START_ID(Sample)','type', 'hapA', 'hapB','genotype', 'seqtech', 'read_depth', 'phase_set','GP0','GP1','GP2','PL0','PL1','PL2', ':END_ID(Variant)'])
for chrom in chrom_list:
print(chrom)
datetime_object = datetime.datetime.now()
print(datetime_object)
sampledata = callset[chrom]['samples']
samples = list(sampledata)
variants = allel.VariantChunkedTable(callset[chrom]['variants'], names=['AC','AF_AFR', 'AF_AMR', 'AF_ASN', 'AF_EUR', 'AF_MAX', 'CGT', 'CLR', 'CSQ', 'DP', 'DP4', 'ESP_MAF', 'FILTER_LowQual', 'FILTER_MinHWE', 'FILTER_MinVQSLOD', 'FILTER_PASS', 'HWE', 'ICF', 'ID', 'IS', 'PC2', 'PCHI2', 'POS', 'PR', 'QCHI2', 'QUAL', 'REF', 'ALT', 'INDEL', 'SHAPEIT', 'SNP_ID', 'TYPE', 'UGT', 'VQSLOD', 'dbSNPmismatch', 'is_snp', 'numalt', 'svlen'], index='POS')
pos = variants['POS'][:]
SNPid = variants['ID'][:]
ref = variants['REF'][:]
alt = variants['ALT'][:]
dp = variants['DP'][:]
ac = variants['AC'][:]
vartype = variants['TYPE'][:]
svlen = variants['svlen'][:]
qual = variants['QUAL'][:]
vq = variants['VQSLOD'][:]
numalt = variants['numalt'][:]
csq = variants['CSQ'][:]
vcfv = 'VCFv4.1'
refv = 'https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/'
calldata = callset[chrom]['calldata']
dpz = calldata['DP']
psz = calldata['PS']
plz = calldata['PL']
gpz = calldata['GP']
gtz = calldata['GT']
i = 0
j = 0
seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'
for i in range(50):
print("Chrom " + str(chrom) + "sample" + str(i))
print(datetime_object)
gt = gtz[:,i].tolist()
hap1, hap2 = zip(*gt)
dp = dpz[:, i]
ps = psz[:, i]
pl = plz[:, i]
gp = gpz[:, i]
subject = samples[i]
for j in range(len(pos)):
h1 = int(hap1[j])
h2 = int(hap2[j])
read_depth = int(dp[j])
ps1 = int(ps[j])
PL0 = int(pl[j][0])
PL1 = int(pl[j][1])
PL2 = int(pl[j][2])
GP0 = float(gp[j][0])
GP1 = float(gp[j][1])
GP2 = float(gp[j][2])
if h1 == 0 and h2 == 0:
filewriter.writerow([subject,"Homozygous",h1 ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
elif h1 == 0 and h2 > 0:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1 ,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])
elif h1 > 0 and h2 == 0:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1, '', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'' ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
elif h1 == h2 and h1 > 0:
filewriter.writerow([subject,"Homozygous",h1, h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
else:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])
It was all done a bit ad-hoc and I'm still new to python, so any tips would be appreciated.
python performance csv iteration
New contributor
$endgroup$
I'm trying to parse data into a csv prior to a bulk transaction into Neo4j. I'm using vast amounts of data in the relationships and wondered if anyone could help in speeding up the transactions below. It's genomic data, with 2,000 samples, each with up to 3.5m variants per chromosome probably equating to about 40m variants, up to two rows per sample, so ~120,000,000,000 rows in total. Currently it's taking me about 20 minutes per sample. Does anyone have any suggestions on how to improve this:
import sys
import time
import datetime
import numpy as np
import allel
import zarr
import numcodecs
import os
import pandas
import csv
import math
import dask.array as da
vcf_directory = '/media/user/Seagate Backup Plus Drive/uk_alspac/'
zarr_path = vcf_directory + 'chroms.zarr'
callset = zarr.open_group(zarr_path, mode='r')
samples_fn = '/media/user/Seagate Backup Plus Drive/uk_alspac/phenotype_data/EGAZ00001016605_UK10K_ALSPAC_Phenotype_Data_August_2013_1867samples.txt'
panel = pandas.DataFrame.from_csv(samples_fn, sep='t')
chrom_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,'X']
i = 0
j = 0
seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'
with open('/media/user/Seagate Backup Plus Drive/uk_alspac/Import_files/sample_variants.csv', 'w') as csvfile:
filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
filewriter.writerow([':START_ID(Sample)','type', 'hapA', 'hapB','genotype', 'seqtech', 'read_depth', 'phase_set','GP0','GP1','GP2','PL0','PL1','PL2', ':END_ID(Variant)'])
for chrom in chrom_list:
print(chrom)
datetime_object = datetime.datetime.now()
print(datetime_object)
sampledata = callset[chrom]['samples']
samples = list(sampledata)
variants = allel.VariantChunkedTable(callset[chrom]['variants'], names=['AC','AF_AFR', 'AF_AMR', 'AF_ASN', 'AF_EUR', 'AF_MAX', 'CGT', 'CLR', 'CSQ', 'DP', 'DP4', 'ESP_MAF', 'FILTER_LowQual', 'FILTER_MinHWE', 'FILTER_MinVQSLOD', 'FILTER_PASS', 'HWE', 'ICF', 'ID', 'IS', 'PC2', 'PCHI2', 'POS', 'PR', 'QCHI2', 'QUAL', 'REF', 'ALT', 'INDEL', 'SHAPEIT', 'SNP_ID', 'TYPE', 'UGT', 'VQSLOD', 'dbSNPmismatch', 'is_snp', 'numalt', 'svlen'], index='POS')
pos = variants['POS'][:]
SNPid = variants['ID'][:]
ref = variants['REF'][:]
alt = variants['ALT'][:]
dp = variants['DP'][:]
ac = variants['AC'][:]
vartype = variants['TYPE'][:]
svlen = variants['svlen'][:]
qual = variants['QUAL'][:]
vq = variants['VQSLOD'][:]
numalt = variants['numalt'][:]
csq = variants['CSQ'][:]
vcfv = 'VCFv4.1'
refv = 'https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/'
calldata = callset[chrom]['calldata']
dpz = calldata['DP']
psz = calldata['PS']
plz = calldata['PL']
gpz = calldata['GP']
gtz = calldata['GT']
i = 0
j = 0
seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'
for i in range(50):
print("Chrom " + str(chrom) + "sample" + str(i))
print(datetime_object)
gt = gtz[:,i].tolist()
hap1, hap2 = zip(*gt)
dp = dpz[:, i]
ps = psz[:, i]
pl = plz[:, i]
gp = gpz[:, i]
subject = samples[i]
for j in range(len(pos)):
h1 = int(hap1[j])
h2 = int(hap2[j])
read_depth = int(dp[j])
ps1 = int(ps[j])
PL0 = int(pl[j][0])
PL1 = int(pl[j][1])
PL2 = int(pl[j][2])
GP0 = float(gp[j][0])
GP1 = float(gp[j][1])
GP2 = float(gp[j][2])
if h1 == 0 and h2 == 0:
filewriter.writerow([subject,"Homozygous",h1 ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
elif h1 == 0 and h2 > 0:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1 ,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])
elif h1 > 0 and h2 == 0:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1, '', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'' ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])
elif h1 == h2 and h1 > 0:
filewriter.writerow([subject,"Homozygous",h1, h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
else:
filewriter.writerow([subject,"Heterozygous - Haplotype A",h1,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])
filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])
It was all done a bit ad-hoc and I'm still new to python, so any tips would be appreciated.
python performance csv iteration
python performance csv iteration
New contributor
New contributor
edited 6 hours ago
Dave C
New contributor
asked 7 hours ago
Dave CDave C
113
113
New contributor
New contributor
$begingroup$
Thanks @Ludisposed. Can you tell me what difference it will be making here? The results seem to come out as anticipated and I'm hesitant to stop the loop as I'll have to start over again.
$endgroup$
– Dave C
6 hours ago
$begingroup$
Ah yes, sorry @Ludisposed that was just a formatting error when writing the question. I've corrected it now. It's running at the moment, I just wondered if there was any way to speed up the loop.
$endgroup$
– Dave C
6 hours ago
add a comment |
$begingroup$
Thanks @Ludisposed. Can you tell me what difference it will be making here? The results seem to come out as anticipated and I'm hesitant to stop the loop as I'll have to start over again.
$endgroup$
– Dave C
6 hours ago
$begingroup$
Ah yes, sorry @Ludisposed that was just a formatting error when writing the question. I've corrected it now. It's running at the moment, I just wondered if there was any way to speed up the loop.
$endgroup$
– Dave C
6 hours ago
$begingroup$
Thanks @Ludisposed. Can you tell me what difference it will be making here? The results seem to come out as anticipated and I'm hesitant to stop the loop as I'll have to start over again.
$endgroup$
– Dave C
6 hours ago
$begingroup$
Thanks @Ludisposed. Can you tell me what difference it will be making here? The results seem to come out as anticipated and I'm hesitant to stop the loop as I'll have to start over again.
$endgroup$
– Dave C
6 hours ago
$begingroup$
Ah yes, sorry @Ludisposed that was just a formatting error when writing the question. I've corrected it now. It's running at the moment, I just wondered if there was any way to speed up the loop.
$endgroup$
– Dave C
6 hours ago
$begingroup$
Ah yes, sorry @Ludisposed that was just a formatting error when writing the question. I've corrected it now. It's running at the moment, I just wondered if there was any way to speed up the loop.
$endgroup$
– Dave C
6 hours ago
add a comment |
0
active
oldest
votes
Your Answer
StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Dave C is a new contributor. Be nice, and check out our Code of Conduct.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211930%2fspeed-up-iteration-for-large-dataset-in-python%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
0
active
oldest
votes
0
active
oldest
votes
active
oldest
votes
active
oldest
votes
Dave C is a new contributor. Be nice, and check out our Code of Conduct.
Dave C is a new contributor. Be nice, and check out our Code of Conduct.
Dave C is a new contributor. Be nice, and check out our Code of Conduct.
Dave C is a new contributor. Be nice, and check out our Code of Conduct.
Thanks for contributing an answer to Code Review Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
Use MathJax to format equations. MathJax reference.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211930%2fspeed-up-iteration-for-large-dataset-in-python%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
$begingroup$
Thanks @Ludisposed. Can you tell me what difference it will be making here? The results seem to come out as anticipated and I'm hesitant to stop the loop as I'll have to start over again.
$endgroup$
– Dave C
6 hours ago
$begingroup$
Ah yes, sorry @Ludisposed that was just a formatting error when writing the question. I've corrected it now. It's running at the moment, I just wondered if there was any way to speed up the loop.
$endgroup$
– Dave C
6 hours ago