- ahmad alhayek
Write a C++ code to count the frequency of each kmer in given DNA sequence. Steps: 1. Get the kmer size from the user. Should Hint: 1. Use the following code to read the fasta file. #define NOMINMAX #include #include #inc } 2. In above code, to access specific location in the sequence, say location 10, simply deal with it as array of char such a Done seq1.fasta_1eb46f874f1ab58... >ido ACATGACTAGCTACGATCGACTAGCTACGATCGACTAGCT ACGATCGACTACTAGCTACGATCGACTATCAGCTATCGAC TAC Show transcribed image text View comments (1) Expert Answerinformation icon Anonymous's Avatar Anonymous answered this Was this answer helpful? Thumbs up inactive0Thumbs down inactive0 9 answers //main.cpp #include "frequency.h" int main(int argc, char* argv[]) { if (argc < 4) { cerr << "Usage: " << argv[0] << " \n"; cerr << "(*optional) \n\n"; cerr << "(*) NOTE: Last argument is optional. If you want to list the kmers\n"; cerr << "that are not in the top n frequent k-mers, but have the same frequency\n"; cerr << "of the last kmers in the top list select this option.\n"; cerr << "\t(Y)es : Include last k-mers\n\t(N)o : Do not include last k-mers\n"; cerr << "Please note that even if this option is selected there might be no\n"; cerr << "more k-mers to be listed.\n"; return 0; } // Basic input controls if (atoi(argv[2]) < 1) { cerr << "Invalid k-mer size.\n"; return 0; } if (atoi(argv[3]) < 1) { cerr << "Invalid number of top k-mers.\n"; return 0; } Frequency freq(argv[1], atoi(argv[2]), atoi(argv[3]), (argv[4] == NULL) ? 'N' : argv[4][0> freq.run(); return 1; } //frequency.cpp /** Frequency class implementation **/ #include "frequency.h" #include "evaltime.h" Frequency::Frequency(const char* fname, const size_t &ksize, const int &knum, const char &includeLast): filename(fname), kmer_size(ksize), top_kmer_num(knum), cntLastFrq(0) { includeLastKmer = (toupper(includeLast) == 'Y') ? true : false; #if (OUTPUT_TO_FILE) outFile.open("out.txt"); #endif } Frequency::~Frequency() { #if (OUTPUT_TO_FILE) outFile.close(); #endif } vecPair Frequency::sortKmers() { start = GetTimeMs64(); frqMultiMap kmer_swapped; try { // Swaps the keys and values of kmer map, sorting the elements according to their values. O(nlogn) for (auto it = kmerFrq.begin(); it != kmerFrq.end(); ++it) { kmer_swapped.insert(pair(it -> second, it -> first)); } // Clear the containers that are not used anymore. kmerFrq.clear(); } catch (const std::bad_alloc& e) { cerr << "Swapping key-value pairs. Allocation failed: " << e.what() << "\n"; kmerFrq.clear(); return vecPair(); } auto rev_end = kmer_swapped.rbegin(); try { // Takes the most frequent n elements from the end of the map which is sorted in ascending order. advance(rev_end, top_kmer_num); } catch (const std::bad_alloc& e) { cerr << "Getting most frequent elements. Allocation failed: " << e.what() << "\n"; return vecPair(); } vecPair kmerVector; try { // Copies the top n kmers from the end of the multimap to a vector in descending order. for (auto rev = kmer_swapped.rbegin(); rev != rev_end; ++rev) { kmerVector.push_back(make_pair(rev->second, rev->first)); } } catch (const std::bad_alloc& e) { cerr << "Copying to vector. Allocation failed: " << e.what() << "\n"; kmerVector.clear(); return vecPair(); } // If user selected to view the kmers that are not on the top list but have // enough frequency to be in the top list. if (includeLastKmer) { for (auto rev = rev_end; rev != kmer_swapped.rend(); ++rev, ++cntLastFrq) { // Add these kmers to the top frequency list if (rev->first == kmerVector.back().second) { kmerVector.push_back(make_pair(rev->second, rev->first)); } else { break; } } } // Clear the containers that are not used anymore. kmer_swapped.clear(); cout << "kmer sort time: " << GetTimeMs64() - start << " ms. \n"; // Write the result to the screen and file. cout << "Top " << top_kmer_num << " frequent k-mers of size " << kmer_size << " and their frequencies:\n\n"; #if (OUTPUT_TO_FILE) outFile << "\nTop " << top_kmer_num << " frequent k-mers of size " << kmer_size << " and their frequencies:\n\n"; #endif for (size_t i = 0; i < kmerVector.size(); ++i) { cout.fill(' '); cout.width(2); cout << (i+1) << ". " << kmerVector[i].first << " : " << kmerVector[i].second << "\n"; #if (OUTPUT_TO_FILE) // Print to file outFile.fill(' '); outFile.width(2); outFile << (i+1) << ". " << kmerVector[i].first << " : " << kmerVector[i].second << "\n"; #endif } if (includeLastKmer) { cout << "\n* " << cntLastFrq << " more kmer(s) are also listed with the same frequencies of the last kmer(s)\n"; #if (OUTPUT_TO_FILE) outFile << "\n* " << cntLastFrq << " more kmer(s) are also listed with the same frequencies of the last kmer(s)\n"; #endif } return kmerVector; } vecPair Frequency::sortKmersEncoded() { start = GetTimeMs64(); encodedMultiMap kmer_swapped; try { // Swaps the keys and values of kmer map, sorting the elements according to their values. O(nlogn) for (auto it = mapFrq.begin(); it != mapFrq.end(); ++it) { kmer_swapped.insert(pair(it -> second, it -> first)); } // Clear the containers that are not used anymore. mapFrq.clear(); } catch (const std::bad_alloc& e) { cerr << "Swapping key-value pairs. Allocation failed: " << e.what() << "\n"; mapFrq.clear(); return vecPair(); } auto rev_end = kmer_swapped.rbegin(); try { // Takes the most frequent n elements from the end of the map which is sorted in ascending order. advance(rev_end, top_kmer_num); } catch (const std::bad_alloc& e) { cerr << "Getting most frequent elements. Allocation failed: " << e.what() << "\n"; return vecPair(); } vecPair kmerVector; try { // Copies the top n kmers from the end of the multimap to a vector in descending order. for (auto rev = kmer_swapped.rbegin(); rev != rev_end; ++rev) { // The decoded kmers are encoded while copying them. kmerVector.push_back(make_pair(decoder(rev->second), rev->first)); } } catch (const std::bad_alloc& e) { cerr << "Copying to vector. Allocation failed: " << e.what() << "\n"; kmerVector.clear(); return vecPair(); } // If user selected to view the kmers that are not on the top list but have // enough frequency to be in the top list. if (includeLastKmer) { for (auto rev = rev_end; rev != kmer_swapped.rend(); ++rev, ++cntLastFrq) { // Decode these kmers and add them to the top frequency list if (rev->first == kmerVector.back().second) { kmerVector.push_back(make_pair(decoder(rev->second), rev->first)); } else { break; } } } // Clear the containers that are not used anymore. kmer_swapped.clear(); cout << "kmer sort time: " << GetTimeMs64() - start << " ms. \n"; // Write the result to the screen and file. cout << "Top " << top_kmer_num << " frequent k-mers of size " << kmer_size << " and their frequencies:\n\n"; #if (OUTPUT_TO_FILE) outFile << "\nTop " << top_kmer_num << " frequent k-mers of size " << kmer_size << " and their frequencies:\n\n"; #endif for (size_t i = 0; i < kmerVector.size(); ++i) { cout.fill(' '); cout.width(2); cout << (i+1) << ". " << kmerVector[i].first << " : " << kmerVector[i].second << "\n"; #if (OUTPUT_TO_FILE) // Print to file outFile.fill(' '); outFile.width(2); outFile << (i+1) << ". " << kmerVector[i].first << " : " << kmerVector[i].second << "\n"; #endif } if (includeLastKmer) { cout << "\n* " << cntLastFrq << " more kmer(s) are also listed with the same frequencies of the last kmer(s)\n"; #if (OUTPUT_TO_FILE) outFile << "\n* " << cntLastFrq << " more kmer(s) are also listed with the same frequencies of the last kmer(s)\n"; #endif } return kmerVector; } int Frequency::findKmersEncoded(ifstream &in) { start = GetTimeMs64(); string line; // Get the second line which is the first dna sequence getline(in, line); getline(in, line); // In this case, line counter starts from the second line uintmax_t cnt_lines = 2; // Checks if the kmer size is smaller than the sequence length. if (kmer_size > line.length()) { cerr << "Error! k-mer size is larger than the sequence length. Exiting ...\n"; exit(0); } // Calculate the kmer number in a single dna sequence int kmer_num = line.length() - kmer_size + 1; // Finds the kmers of the first dna sequence and count their frequencies O(kmer_num + kmer_size) for (int i = 0; i < kmer_num; ++i) { ++mapFrq[encoder(line.substr(i, kmer_size))]; } while (getline(in, line)) { // Get the second line (dna sequence) of each nucleotide if (++cnt_lines % 4 == 2) { // Find the kmers of the remaining dna sequences and count their frequencies for (int i = 0; i < kmer_num; ++i) { ++mapFrq[encoder(line.substr(i, kmer_size))]; } } } cout << "kmer find time: " << GetTimeMs64() - start << " ms. \n"; return 1; } int Frequency::findKmers(ifstream &in) { start = GetTimeMs64(); string line; // Get the second line which is the first dna sequence getline(in, line); getline(in, line); // In this case, line counter starts from the second line uintmax_t cnt_lines = 2; // Checks if the kmer size is smaller than the sequence length. if (kmer_size > line.length()) { cerr << "Error! k-mer size is larger than the sequence length. Exiting ...\n"; exit(0); } cout << "k-mer size is larger than " << KMER_LIMIT << ". Encoding will not be used.\n"; // Calculate the kmer number in a single dna sequence int kmer_num = line.length() - kmer_size + 1; // Finds the kmers of the first dna sequence and count their frequencies O(kmer_num + kmer_size) for (int i = 0; i < kmer_num; ++i) { ++kmerFrq[line.substr(i, kmer_size)]; } while (getline(in, line)) { // Get the second line (dna sequence) of each nucleotide if (++cnt_lines % 4 == 2) { // Find the kmers of the remaining dna sequences and count their frequencies for (int i = 0; i < kmer_num; ++i) { ++kmerFrq[line.substr(i, kmer_size)]; } } } cout << "kmer find time: " << GetTimeMs64() - start << " ms. \n"; return 1; } int Frequency::run() { ifstream fastq_file(filename); // Set the buffer size manually fastq_file.rdbuf()->pubsetbuf(_buffer, BufferSize); if (fastq_file.is_open()) { cout << "Input file " << filename << " opened for extracting k-mers.\n"; if (kmer_size <= KMER_LIMIT) findKmersEncoded(fastq_file); else findKmers(fastq_file); fastq_file.close(); } else { cerr << "Unable to open file" << endl; return 0; } // Print the top kmers if (kmer_size <= KMER_LIMIT) sortKmersEncoded(); else sortKmers(); return 1; } // Encoder function that returns the binary representation of // the k-mers in unsigned integer format inline uint64_t Frequency::encoder(string kmer) { uint64_t code = 0; for (size_t i = 0; i < kmer.size(); ++i) { code *= 4; switch(kmer[i]) { case 'A': code += SEQ_A; break; case 'C': code += SEQ_C; break; case 'G': code += SEQ_G; break; case 'T': code += SEQ_T; break; } } return code; } // Decoder function that returns the string represenations of // the binary encoded k-mers inline string Frequency::decoder(uint64_t code) { string seq; for (size_t i = 0; i < kmer_size; ++i) { switch(code & SHIFT_MASK) { case 0: seq = "A" + seq; break; case 1: seq = "C" + seq; break; case 2: seq = "G" + seq; break; case 3: seq = "T" + seq; break; } code >>= 2; } return seq; } //frequency.h /**************************************************************** DESCRIPTION: Class to calculate the top n frequencies of k-mers of size k from a given FASTQ file. AUTHOR: Mert TAS ******************************************************************/ #ifndef FREQUENCY_H #define FREQUENCY_H // Output of the program (top frequent kmers) is written to a file #define OUTPUT_TO_FILE 1 #include #include // file stream #include #include // map & multimap #include #include // atoi // Mask to decode bits to char #define SHIFT_MASK 0x3 // binary 11 // The maximum kmer length that could be encoded #define KMER_LIMIT 32 using namespace std; // unsorted map with uint64_t encoded k-mers typedef unordered_map encodedMap; typedef multimap encodedMultiMap; typedef vector > vecPair; // unsorted map with no encoding typedef unordered_map frqMap; typedef multimap frqMultiMap; enum BinaryBases { SEQ_A = 0x0, // binary: 00 SEQ_C = 0x1, // binary: 01 SEQ_G = 0x2, // binary: 10 SEQ_T = 0x3, // binary: 11 }; class Frequency { public: Frequency(const char* fname, const size_t &ksize, const int &knum, const char &includeLast = 'N'); ~Frequency(); inline uint64_t encoder(string kmer); inline string decoder(uint64_t code); int findKmersEncoded(ifstream &in); int findKmers(ifstream &in); vecPair sortKmersEncoded(); vecPair sortKmers(); int run(); private: // Manually set buffer size for the ifstream // A larger buffer size (as far as the system can handle) is expected to // speed up file reading by limiting interaction with the hard disk drive // and the number of system calls. enum { BufferSize = 1024 //1048576 // 2^20 // 1024 // 16384 }; char _buffer[BufferSize]; const char* filename; size_t kmer_size; int top_kmer_num; // Counts the frequency of k-mers encodedMap mapFrq; frqMap kmerFrq; // If user selects to view the kmers that are not in the top list // but have the same frequency with the last kmer in the list bool includeLastKmer; int cntLastFrq; // Timer start value uint64_t start; #if (OUTPUT_TO_FILE) ofstream outFile; #endif }; #endif // FREQUENCY_H //evaltime.h /* Time Measuring Function by Andreas Bonini stackoverflow.com/questions/1861294/how-to-calculate-execution-time-of-a-code-snippet-in-c */ #ifdef _WIN32 #include #else #include #include #endif /* Remove if already defined */ typedef long long int64; typedef unsigned long long uint64; /* Returns the amount of milliseconds elapsed since the UNIX epoch. Works on both * windows and linux. */ uint64 GetTimeMs64() { #ifdef _WIN32 /* Windows */ FILETIME ft; LARGE_INTEGER li; /* Get the amount of 100 nano seconds intervals elapsed since January 1, 1601 (UTC) and copy it * to a LARGE_INTEGER structure. */ GetSystemTimeAsFileTime(&ft); li.LowPart = ft.dwLowDateTime; li.HighPart = ft.dwHighDateTime; uint64 ret = li.QuadPart; ret -= 116444736000000000LL; /* Convert from file time to UNIX epoch time. */ ret /= 10000; /* From 100 nano seconds (10^-7) to 1 millisecond (10^-3) intervals */ return ret; #else /* Linux */ struct timeval tv; gettimeofday(&tv, NULL); uint64 ret = tv.tv_usec; /* Convert from micro seconds (10^-6) to milliseconds (10^-3) */ ret /= 1000; /* Adds the seconds (10^0) after converting them to milliseconds (10^-3) */ ret += (tv.tv_sec * 1000); return ret; #endif } HOW TO USE rootākali:-/test/kmer-counter# gu main.cpp frequency.cpp root@kali:-/test/kmer-counter# ./a.out dna 2 1000 Input file dna operootakali:-/test/kmer-counter# ./a.out dna 4 1000 Input file dna opened for extracting k-mers. kmer find time: 0 ms. kmer sor dna file CACATCGTGGCTCCCGTCTGCTGAGTTGGGCAAAACAGATTGCGTGTAGGTCTAATATG CATCCCAAGCTTTCGCTGTGGCGGCTTGATACATTGAGAGCGGGGTGTAAGATAGTAGAC TAAGATGTCAGTTGCAGAGCGTTAGAGTTTGGCACCAGTGTGGGGTCCGCTAGCAAACCT CGTGCCATGATAAAGTAAGGTTGTTGATCTGACCTATTTCGGTTATGGCCTTGCCAAGTA TCGCATCGAGGGTCAATACCAAACAAGGTAGACACAAACGCTATTGTTACAAGTCTAAGG TCAAACGTTGGCTTACTGGAGTAACGCACGCATGCGTCGCGTCCTCCCAGCTGAAACAGG CAAATGTTAACAAGGGAAAACAGGACTTAATCAGCGGCGTAACACACTACCTAATAACGG TCGGACGATTTAAGCGGTCCGCCACGTACGGAGAACAAAACTATATTTTGCTGGCTAATC GTGGTAACCGCGATCAGGATAAGCAGATCTCGGCCCAGCAGGATCGGTTGGAGTTCTCGC TCGCCGAGCCATTTGTCCATAGTGCACTAACCGCACCGAAAGTCTCGAATGCAGGACTCG GGTTGAATTTAGACACGCAACGTGACCTGCGTGTTATCTCGTACATTCTTTTTACACACG TTAGTATCTCCTGGCACCGTGACTCGCTAGTGTGTCAGAACATATGAAATTCACACCCGT CCACCGCATTCTCGAACCAGCTCATACGTTATCATATATGTACCTCCAATGGGCCATATG GACGGGACATAACACCTGCTCTCAAGGCCATGGTCGTCGTTGCGATTTGTACGCCTAATC AGTAATCCTTCTCCGCGCAGATCCCAATTTACGTCCTACGCTCTTTCATCCGCGTTTTTA ACACAATATTACTCACGTGAGAGTGAAGCCACCGGTCGCCCGTCGGGTCCCACCGTACGT GTAGACCCGCTCCGCCTGTTCAGCAAAAAGACCGAGTAAA Comment

هناك تعليقان (2):

  1. https://www.chegg.com/homework-help/questions-and-answers/somenone-explain-im-lost-got-i3-mesh-analysis-j045-i3-i2-j-i3-i2-2-i3-0-photos-work-chrono-q43323047?fbclid=IwAR0ZcAs2J1itXSbwmJ5xITyUkGcXHFv5t_rBX2I9asKkIxtuF_mNmOtsbKw

    ردحذف
  2. https://www.chegg.com/homework-help/questions-and-answers/write-program-declares-array-matrix-4-4-type-double-initialize-array-components-first-two--q65912663?fbclid=IwAR3SVydYQ8ymdSiAnwIfe3RmQ6VxURxzvDAdvwNbNaNCZXCaQYD5Uyj7Vi8

    ردحذف

ahmad alhayek تصميم ahmad alhayek جميع الحقوق محفوظة 2016

صور المظاهر بواسطة sndr. يتم التشغيل بواسطة Blogger.