Speed up iteration for large dataset in python

I'm trying to parse data into a csv prior to a bulk transaction into Neo4j. I'm using vast amounts of data in the relationships and wondered if anyone could help in speeding up the transactions below. It's genomic data, with 2,000 samples, each with up to 3.5m variants per chromosome probably equating to about 40m variants, up to two rows per sample, so ~120,000,000,000 rows in total. Currently it's taking me about 20 minutes per sample. Does anyone have any suggestions on how to improve this:

import sys

import time

import datetime

import numpy as np

import allel

import zarr

import numcodecs

import os

import pandas

import csv

import math

import dask.array as da

vcf_directory = '/media/user/Seagate Backup Plus Drive/uk_alspac/'

zarr_path = vcf_directory + 'chroms.zarr'

callset = zarr.open_group(zarr_path, mode='r')

samples_fn = '/media/user/Seagate Backup Plus Drive/uk_alspac/phenotype_data/EGAZ00001016605_UK10K_ALSPAC_Phenotype_Data_August_2013_1867samples.txt'

panel = pandas.DataFrame.from_csv(samples_fn, sep='t')

chrom_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,'X']

i = 0

j = 0

seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'

with open('/media/user/Seagate Backup Plus Drive/uk_alspac/Import_files/sample_variants.csv', 'w') as csvfile:

    filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)

    filewriter.writerow([':START_ID(Sample)','type', 'hapA', 'hapB','genotype', 'seqtech', 'read_depth', 'phase_set','GP0','GP1','GP2','PL0','PL1','PL2', ':END_ID(Variant)'])

    for chrom in chrom_list:

        print(chrom)

        datetime_object = datetime.datetime.now()

        print(datetime_object) 

        sampledata = callset[chrom]['samples']

        samples = list(sampledata)

        variants = allel.VariantChunkedTable(callset[chrom]['variants'], names=['AC','AF_AFR', 'AF_AMR', 'AF_ASN', 'AF_EUR', 'AF_MAX', 'CGT', 'CLR', 'CSQ', 'DP', 'DP4', 'ESP_MAF', 'FILTER_LowQual', 'FILTER_MinHWE', 'FILTER_MinVQSLOD', 'FILTER_PASS', 'HWE', 'ICF', 'ID', 'IS', 'PC2', 'PCHI2', 'POS', 'PR', 'QCHI2', 'QUAL', 'REF', 'ALT', 'INDEL', 'SHAPEIT', 'SNP_ID', 'TYPE', 'UGT', 'VQSLOD', 'dbSNPmismatch', 'is_snp', 'numalt', 'svlen'], index='POS')

        pos = variants['POS'][:]

        SNPid = variants['ID'][:]

        ref = variants['REF'][:]

        alt = variants['ALT'][:]

        dp = variants['DP'][:]

        ac = variants['AC'][:]

        vartype = variants['TYPE'][:]

        svlen = variants['svlen'][:]

        qual = variants['QUAL'][:]

        vq = variants['VQSLOD'][:]

        numalt = variants['numalt'][:]

        csq = variants['CSQ'][:]

        vcfv = 'VCFv4.1'

        refv = 'https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/'

        calldata = callset[chrom]['calldata']

        dpz = calldata['DP']

        psz = calldata['PS']

        plz = calldata['PL']

        gpz = calldata['GP']

        gtz = calldata['GT']

        i = 0

        j = 0

        seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'

        for i in range(50):

            print("Chrom " + str(chrom) + "sample" + str(i))

            print(datetime_object)

            gt = gtz[:,i].tolist()

            hap1, hap2 = zip(*gt)

            dp = dpz[:, i]

            ps = psz[:, i]

            pl = plz[:, i]

            gp = gpz[:, i]

            subject = samples[i]

            for j in range(len(pos)): 

                h1 = int(hap1[j])

                h2 = int(hap2[j])

                read_depth = int(dp[j])

                ps1 = int(ps[j])

                PL0 = int(pl[j][0])

                PL1 = int(pl[j][1])

                PL2 = int(pl[j][2])        

                GP0 = float(gp[j][0])

                GP1 = float(gp[j][1])

                GP2 = float(gp[j][2])



                if h1 == 0 and h2 == 0:

                    filewriter.writerow([subject,"Homozygous",h1 ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                elif h1 == 0 and h2 > 0:  

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1 ,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])

                elif h1 > 0 and h2 == 0:

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1, '', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'' ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                elif h1 == h2 and h1 > 0:

                    filewriter.writerow([subject,"Homozygous",h1, h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])     

                else:

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])

It was all done a bit ad-hoc and I'm still new to python, so any tips would be appreciated.

edited 6 hours ago

asked 7 hours ago

Dave C

113

New contributor

$begingroup$
Thanks @Ludisposed. Can you tell me what difference it will be making here? The results seem to come out as anticipated and I'm hesitant to stop the loop as I'll have to start over again.
$endgroup$
– Dave C
6 hours ago

$begingroup$
Ah yes, sorry @Ludisposed that was just a formatting error when writing the question. I've corrected it now. It's running at the moment, I just wondered if there was any way to speed up the loop.
$endgroup$
– Dave C
6 hours ago

add a comment |

import sys

import time

import datetime

import numpy as np

import allel

import zarr

import numcodecs

import os

import pandas

import csv

import math

import dask.array as da

vcf_directory = '/media/user/Seagate Backup Plus Drive/uk_alspac/'

zarr_path = vcf_directory + 'chroms.zarr'

callset = zarr.open_group(zarr_path, mode='r')

samples_fn = '/media/user/Seagate Backup Plus Drive/uk_alspac/phenotype_data/EGAZ00001016605_UK10K_ALSPAC_Phenotype_Data_August_2013_1867samples.txt'

panel = pandas.DataFrame.from_csv(samples_fn, sep='t')

chrom_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,'X']

i = 0

j = 0

seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'

with open('/media/user/Seagate Backup Plus Drive/uk_alspac/Import_files/sample_variants.csv', 'w') as csvfile:

    filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)

    filewriter.writerow([':START_ID(Sample)','type', 'hapA', 'hapB','genotype', 'seqtech', 'read_depth', 'phase_set','GP0','GP1','GP2','PL0','PL1','PL2', ':END_ID(Variant)'])

    for chrom in chrom_list:

        print(chrom)

        datetime_object = datetime.datetime.now()

        print(datetime_object) 

        sampledata = callset[chrom]['samples']

        samples = list(sampledata)

        variants = allel.VariantChunkedTable(callset[chrom]['variants'], names=['AC','AF_AFR', 'AF_AMR', 'AF_ASN', 'AF_EUR', 'AF_MAX', 'CGT', 'CLR', 'CSQ', 'DP', 'DP4', 'ESP_MAF', 'FILTER_LowQual', 'FILTER_MinHWE', 'FILTER_MinVQSLOD', 'FILTER_PASS', 'HWE', 'ICF', 'ID', 'IS', 'PC2', 'PCHI2', 'POS', 'PR', 'QCHI2', 'QUAL', 'REF', 'ALT', 'INDEL', 'SHAPEIT', 'SNP_ID', 'TYPE', 'UGT', 'VQSLOD', 'dbSNPmismatch', 'is_snp', 'numalt', 'svlen'], index='POS')

        pos = variants['POS'][:]

        SNPid = variants['ID'][:]

        ref = variants['REF'][:]

        alt = variants['ALT'][:]

        dp = variants['DP'][:]

        ac = variants['AC'][:]

        vartype = variants['TYPE'][:]

        svlen = variants['svlen'][:]

        qual = variants['QUAL'][:]

        vq = variants['VQSLOD'][:]

        numalt = variants['numalt'][:]

        csq = variants['CSQ'][:]

        vcfv = 'VCFv4.1'

        refv = 'https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/'

        calldata = callset[chrom]['calldata']

        dpz = calldata['DP']

        psz = calldata['PS']

        plz = calldata['PL']

        gpz = calldata['GP']

        gtz = calldata['GT']

        i = 0

        j = 0

        seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'

        for i in range(50):

            print("Chrom " + str(chrom) + "sample" + str(i))

            print(datetime_object)

            gt = gtz[:,i].tolist()

            hap1, hap2 = zip(*gt)

            dp = dpz[:, i]

            ps = psz[:, i]

            pl = plz[:, i]

            gp = gpz[:, i]

            subject = samples[i]

            for j in range(len(pos)): 

                h1 = int(hap1[j])

                h2 = int(hap2[j])

                read_depth = int(dp[j])

                ps1 = int(ps[j])

                PL0 = int(pl[j][0])

                PL1 = int(pl[j][1])

                PL2 = int(pl[j][2])        

                GP0 = float(gp[j][0])

                GP1 = float(gp[j][1])

                GP2 = float(gp[j][2])



                if h1 == 0 and h2 == 0:

                    filewriter.writerow([subject,"Homozygous",h1 ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                elif h1 == 0 and h2 > 0:  

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1 ,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])

                elif h1 > 0 and h2 == 0:

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1, '', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'' ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                elif h1 == h2 and h1 > 0:

                    filewriter.writerow([subject,"Homozygous",h1, h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])     

                else:

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])

It was all done a bit ad-hoc and I'm still new to python, so any tips would be appreciated.

edited 6 hours ago

asked 7 hours ago

Dave C

113

New contributor

$begingroup$
Thanks @Ludisposed. Can you tell me what difference it will be making here? The results seem to come out as anticipated and I'm hesitant to stop the loop as I'll have to start over again.
$endgroup$
– Dave C
6 hours ago

$begingroup$
Ah yes, sorry @Ludisposed that was just a formatting error when writing the question. I've corrected it now. It's running at the moment, I just wondered if there was any way to speed up the loop.
$endgroup$
– Dave C
6 hours ago

add a comment |

import sys

import time

import datetime

import numpy as np

import allel

import zarr

import numcodecs

import os

import pandas

import csv

import math

import dask.array as da

vcf_directory = '/media/user/Seagate Backup Plus Drive/uk_alspac/'

zarr_path = vcf_directory + 'chroms.zarr'

callset = zarr.open_group(zarr_path, mode='r')

samples_fn = '/media/user/Seagate Backup Plus Drive/uk_alspac/phenotype_data/EGAZ00001016605_UK10K_ALSPAC_Phenotype_Data_August_2013_1867samples.txt'

panel = pandas.DataFrame.from_csv(samples_fn, sep='t')

chrom_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,'X']

i = 0

j = 0

seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'

with open('/media/user/Seagate Backup Plus Drive/uk_alspac/Import_files/sample_variants.csv', 'w') as csvfile:

    filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)

    filewriter.writerow([':START_ID(Sample)','type', 'hapA', 'hapB','genotype', 'seqtech', 'read_depth', 'phase_set','GP0','GP1','GP2','PL0','PL1','PL2', ':END_ID(Variant)'])

    for chrom in chrom_list:

        print(chrom)

        datetime_object = datetime.datetime.now()

        print(datetime_object) 

        sampledata = callset[chrom]['samples']

        samples = list(sampledata)

        variants = allel.VariantChunkedTable(callset[chrom]['variants'], names=['AC','AF_AFR', 'AF_AMR', 'AF_ASN', 'AF_EUR', 'AF_MAX', 'CGT', 'CLR', 'CSQ', 'DP', 'DP4', 'ESP_MAF', 'FILTER_LowQual', 'FILTER_MinHWE', 'FILTER_MinVQSLOD', 'FILTER_PASS', 'HWE', 'ICF', 'ID', 'IS', 'PC2', 'PCHI2', 'POS', 'PR', 'QCHI2', 'QUAL', 'REF', 'ALT', 'INDEL', 'SHAPEIT', 'SNP_ID', 'TYPE', 'UGT', 'VQSLOD', 'dbSNPmismatch', 'is_snp', 'numalt', 'svlen'], index='POS')

        pos = variants['POS'][:]

        SNPid = variants['ID'][:]

        ref = variants['REF'][:]

        alt = variants['ALT'][:]

        dp = variants['DP'][:]

        ac = variants['AC'][:]

        vartype = variants['TYPE'][:]

        svlen = variants['svlen'][:]

        qual = variants['QUAL'][:]

        vq = variants['VQSLOD'][:]

        numalt = variants['numalt'][:]

        csq = variants['CSQ'][:]

        vcfv = 'VCFv4.1'

        refv = 'https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/'

        calldata = callset[chrom]['calldata']

        dpz = calldata['DP']

        psz = calldata['PS']

        plz = calldata['PL']

        gpz = calldata['GP']

        gtz = calldata['GT']

        i = 0

        j = 0

        seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'

        for i in range(50):

            print("Chrom " + str(chrom) + "sample" + str(i))

            print(datetime_object)

            gt = gtz[:,i].tolist()

            hap1, hap2 = zip(*gt)

            dp = dpz[:, i]

            ps = psz[:, i]

            pl = plz[:, i]

            gp = gpz[:, i]

            subject = samples[i]

            for j in range(len(pos)): 

                h1 = int(hap1[j])

                h2 = int(hap2[j])

                read_depth = int(dp[j])

                ps1 = int(ps[j])

                PL0 = int(pl[j][0])

                PL1 = int(pl[j][1])

                PL2 = int(pl[j][2])        

                GP0 = float(gp[j][0])

                GP1 = float(gp[j][1])

                GP2 = float(gp[j][2])



                if h1 == 0 and h2 == 0:

                    filewriter.writerow([subject,"Homozygous",h1 ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                elif h1 == 0 and h2 > 0:  

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1 ,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])

                elif h1 > 0 and h2 == 0:

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1, '', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'' ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                elif h1 == h2 and h1 > 0:

                    filewriter.writerow([subject,"Homozygous",h1, h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])     

                else:

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])

It was all done a bit ad-hoc and I'm still new to python, so any tips would be appreciated.

edited 6 hours ago

asked 7 hours ago

Dave C

113

New contributor

import sys

import time

import datetime

import numpy as np

import allel

import zarr

import numcodecs

import os

import pandas

import csv

import math

import dask.array as da

vcf_directory = '/media/user/Seagate Backup Plus Drive/uk_alspac/'

zarr_path = vcf_directory + 'chroms.zarr'

callset = zarr.open_group(zarr_path, mode='r')

samples_fn = '/media/user/Seagate Backup Plus Drive/uk_alspac/phenotype_data/EGAZ00001016605_UK10K_ALSPAC_Phenotype_Data_August_2013_1867samples.txt'

panel = pandas.DataFrame.from_csv(samples_fn, sep='t')

chrom_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,'X']

i = 0

j = 0

seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'

with open('/media/user/Seagate Backup Plus Drive/uk_alspac/Import_files/sample_variants.csv', 'w') as csvfile:

    filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)

    filewriter.writerow([':START_ID(Sample)','type', 'hapA', 'hapB','genotype', 'seqtech', 'read_depth', 'phase_set','GP0','GP1','GP2','PL0','PL1','PL2', ':END_ID(Variant)'])

    for chrom in chrom_list:

        print(chrom)

        datetime_object = datetime.datetime.now()

        print(datetime_object) 

        sampledata = callset[chrom]['samples']

        samples = list(sampledata)

        variants = allel.VariantChunkedTable(callset[chrom]['variants'], names=['AC','AF_AFR', 'AF_AMR', 'AF_ASN', 'AF_EUR', 'AF_MAX', 'CGT', 'CLR', 'CSQ', 'DP', 'DP4', 'ESP_MAF', 'FILTER_LowQual', 'FILTER_MinHWE', 'FILTER_MinVQSLOD', 'FILTER_PASS', 'HWE', 'ICF', 'ID', 'IS', 'PC2', 'PCHI2', 'POS', 'PR', 'QCHI2', 'QUAL', 'REF', 'ALT', 'INDEL', 'SHAPEIT', 'SNP_ID', 'TYPE', 'UGT', 'VQSLOD', 'dbSNPmismatch', 'is_snp', 'numalt', 'svlen'], index='POS')

        pos = variants['POS'][:]

        SNPid = variants['ID'][:]

        ref = variants['REF'][:]

        alt = variants['ALT'][:]

        dp = variants['DP'][:]

        ac = variants['AC'][:]

        vartype = variants['TYPE'][:]

        svlen = variants['svlen'][:]

        qual = variants['QUAL'][:]

        vq = variants['VQSLOD'][:]

        numalt = variants['numalt'][:]

        csq = variants['CSQ'][:]

        vcfv = 'VCFv4.1'

        refv = 'https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/'

        calldata = callset[chrom]['calldata']

        dpz = calldata['DP']

        psz = calldata['PS']

        plz = calldata['PL']

        gpz = calldata['GP']

        gtz = calldata['GT']

        i = 0

        j = 0

        seq_tech = 'Illumina HiSeq 2000 (ILLUMINA)'

        for i in range(50):

            print("Chrom " + str(chrom) + "sample" + str(i))

            print(datetime_object)

            gt = gtz[:,i].tolist()

            hap1, hap2 = zip(*gt)

            dp = dpz[:, i]

            ps = psz[:, i]

            pl = plz[:, i]

            gp = gpz[:, i]

            subject = samples[i]

            for j in range(len(pos)): 

                h1 = int(hap1[j])

                h2 = int(hap2[j])

                read_depth = int(dp[j])

                ps1 = int(ps[j])

                PL0 = int(pl[j][0])

                PL1 = int(pl[j][1])

                PL2 = int(pl[j][2])        

                GP0 = float(gp[j][0])

                GP1 = float(gp[j][1])

                GP2 = float(gp[j][2])



                if h1 == 0 and h2 == 0:

                    filewriter.writerow([subject,"Homozygous",h1 ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                elif h1 == 0 and h2 > 0:  

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1 ,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])

                elif h1 > 0 and h2 == 0:

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1, '', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'' ,h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + ref[j]])

                elif h1 == h2 and h1 > 0:

                    filewriter.writerow([subject,"Homozygous",h1, h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])     

                else:

                    filewriter.writerow([subject,"Heterozygous - Haplotype A",h1,'', str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h1-1]])

                    filewriter.writerow([subject,"Heterozygous - Haplotype B",'',h2, str(h1) + '|' + str(h2), seq_tech, read_depth, ps1, GP0, GP1, GP2, PL0, PL1, PL2, str(chrom) + '-' + str(pos[j]) + alt[j][h2-1]])

It was all done a bit ad-hoc and I'm still new to python, so any tips would be appreciated.

python performance csv iteration

edited 6 hours ago

asked 7 hours ago

Dave C

113

New contributor

edited 6 hours ago

asked 7 hours ago

Dave C

113

New contributor

edited 6 hours ago

asked 7 hours ago

Dave C

113

New contributor

asked 7 hours ago

Dave C

113

asked 7 hours ago

Dave C

113

New contributor

Dave C is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.

$begingroup$
Thanks @Ludisposed. Can you tell me what difference it will be making here? The results seem to come out as anticipated and I'm hesitant to stop the loop as I'll have to start over again.
$endgroup$
– Dave C
6 hours ago

$begingroup$
Ah yes, sorry @Ludisposed that was just a formatting error when writing the question. I've corrected it now. It's running at the moment, I just wondered if there was any way to speed up the loop.
$endgroup$
– Dave C
6 hours ago

add a comment |

$begingroup$
Thanks @Ludisposed. Can you tell me what difference it will be making here? The results seem to come out as anticipated and I'm hesitant to stop the loop as I'll have to start over again.
$endgroup$
– Dave C
6 hours ago

$begingroup$
Ah yes, sorry @Ludisposed that was just a formatting error when writing the question. I've corrected it now. It's running at the moment, I just wondered if there was any way to speed up the loop.
$endgroup$
– Dave C
6 hours ago

Thanks @Ludisposed. Can you tell me what difference it will be making here? The results seem to come out as anticipated and I'm hesitant to stop the loop as I'll have to start over again.

– Dave C
6 hours ago

Ah yes, sorry @Ludisposed that was just a formatting error when writing the question. I've corrected it now. It's running at the moment, I just wondered if there was any way to speed up the loop.

– Dave C
6 hours ago

add a comment |

0

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

Dave C is a new contributor. Be nice, and check out our Code of Conduct.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211930%2fspeed-up-iteration-for-large-dataset-in-python%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

0

active

oldest

votes

0

active

oldest

votes

Dave C is a new contributor. Be nice, and check out our Code of Conduct.

draft saved

draft discarded

Dave C is a new contributor. Be nice, and check out our Code of Conduct.

Thanks for contributing an answer to Code Review Stack Exchange!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

Use MathJax to format equations. MathJax reference.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

SHAv EvkxLfcBdlwuAM

搜尋此網誌

Ytdyklly