diff --git a/.build.sh b/.build.sh index 65984a2..7ff3130 100755 --- a/.build.sh +++ b/.build.sh @@ -35,8 +35,11 @@ EOF # cd +rm -rf kmerdb-0.7*dist-info/ kmerdb.egg-info/ build/ + python -m build auditwheel repair --plat manylinux2014_x86_64 dist/kmerdb-*linux_x86_64.whl mv wheelhouse/* dist rm dist/kmerdb-*linux_x86_64.whl + diff --git a/.clean_local_install.sh b/.clean_local_install.sh index ac6c397..6922ea4 100755 --- a/.clean_local_install.sh +++ b/.clean_local_install.sh @@ -18,9 +18,9 @@ EOF -rm -rf /home/matt/.pyenv/versions/kdb/lib/python3.10/site-packages/kmerdb /home/matt/.pyenv/versions/kdb/lib/python3.10/site-packages/kmerdb-*.egg-info /ffast2/kdb/kmerdb.egg-info /ffast2/kdb/build /ffast2/kdb/dist -rm -rf /home/matt/.pyenv/versions/3.10.1/envs/kdb/lib/python3.10/site-packages/kmerdb-* -cd /ffast2/kdb/ +rm -rf /home/matt/.pyenv/versions/kdb/lib/python3.11/site-packages/kmerdb /home/matt/.pyenv/versions/kdb/lib/python3.11/site-packages/kmerdb-*.egg-info /home/matt/Projects/kdb/kmerdb.egg-info /home/matt/Projects/kdb/build /home/matt/Projects/kdb/dist +rm -rf /home/matt/.pyenv/versions/3.11.7/envs/kdb/lib/python3.11/site-packages/kmerdb-* +cd /home/matt/Projects/kdb/ rm -rf dist build kmerdb.egg-info wheelhouse cd diff --git a/.gitignore b/.gitignore index 5a4bcf1..d1e3080 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ kdb/__pycache__ **.kdb examples/example_report/*.png examples/example_report -test/data \ No newline at end of file +test/data +pypi_token.foo diff --git a/.release.sh b/.release.sh new file mode 100644 index 0000000..b662a67 --- /dev/null +++ b/.release.sh @@ -0,0 +1,22 @@ +#!/bin/bash +cat < A Python CLI and module for k-mer profiles, similarities, and graph databases -NOTE: This project is in alpha stage. Development is ongoing. But feel free to clone the repository and play with the code for yourself. +NOTE: This project is in beta stage. Development is ongoing. But feel free to clone the repository and play with the code for yourself. ## Development Status [![Downloads](https://static.pepy.tech/personalized-badge/kmerdb?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=Downloads)](https://pypi.org/project/kmerdb) @@ -20,12 +20,12 @@ NOTE: This project is in alpha stage. Development is ongoing. But feel free to c ## Summary -KDB is a Python library designed for bioinformatics applications. It addresses the ['k-mer' problem](https://en.wikipedia.org/wiki/K-mer) (substrings of length k) in a simple and performant manner. It stores the k-mer counts/abundances and total counts. You can think of the current form as a "pre-index", as it includes all the essential information for indexing on any field in the landscape of k-mer to sequence relationships. One restriction is that k-mers with unspecified sequence residues 'N' create gaps in the k-mer to sequence relationship space, and are excluded. That said, non-standard IUPAC residues are supported. +KDB is a Python library designed for bioinformatics applications. It addresses the ['k-mer' problem](https://en.wikipedia.org/wiki/K-mer) (substrings of length k) in a simple and performant manner. It stores the k-mer counts/abundances in a columnar format, with input file checksums, total counts, nullomers, and mononucleotide counts in a YAML formatted header in the first block of the `bgzf` formatted `.kdb` file. One restriction is that k-mers with unspecified sequence residues 'N' create gaps in the k-mer to sequence relationship space, and are excluded. 
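To make the exclusion rule concrete, here is a minimal sketch of the sliding-window behavior described above (the `shred` helper is hypothetical, for illustration only, and is not kmerdb's internal API):

```python
# Hypothetical sketch: a sliding window yields one k-mer per position,
# and any window containing 'N' is skipped, leaving a gap in the
# k-mer to sequence relationship space.
def shred(seq: str, k: int):
    for i in range(len(seq) - k + 1):
        kmer = seq[i:i + k]
        if "N" not in kmer:
            yield kmer

print(list(shred("ACGTNACGT", 3)))  # ['ACG', 'CGT', 'ACG', 'CGT']
```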
That said, non-standard IUPAC residues are supported. Please see the [Quickstart guide](https://matthewralston.github.io/kmerdb/quickstart) for more information about the format, the library, and the project. -The k-mer spectrum of the fasta or fastq sequencing data is stored in the `.kdb` format spec, a bgzf file similar to `.bam`. For those familiar with `.bam`, a `view` and `header` functions are provided to decompress a `.kdb` file into a standard output stream. +The k-mer spectrum of the fasta or fastq sequencing data is stored in the `.kdb` format spec, a bgzf file similar to `.bam`. For those familiar with `.bam`, a `view` and `header` functions are provided to decompress a `.kdb` file into a standard output stream. The output file is compatible with `zlib`. @@ -79,23 +79,37 @@ See `python -m kmerdb -h` for details. ```bash kmerdb --help -# Build a [composite] profile to a new or existing .kdb file +# Build a [composite] profile to a new .kdb file kmerdb profile -k 8 example1.fq.gz example2.fq.gz profile.8.kdb +# Note: zlib compatibility +zcat profile.8.kdb + +# Build a weighted edge list +kmerdb graph -k 12 example1.fq.gz example2.fq.gz edges.kdbg + # View the raw data kmerdb view profile.8.kdb # -H for full header # View the header kmerdb header profile.8.kdb -# Collate the files -kmerdb matrix -p $cores pass *.8.kdb +# Collate the files. See 'kmerdb matrix -h' for more information. +# Note: the 'pass' subcommand passes the int counts through collation, without normalization. +# In this case the shell interprets '*.8.kdb' as all 8-mer profiles in the current working directory. +# The k-mer profiles are read in parallel (-p $cores), and collated into one Pandas dataframe, which is printed to STDOUT. +# Other options include DESeq2 normalization, frequency matrix, or PCA|tSNE based dimensionality reduction techniques. +kmerdb matrix -p $cores pass *.8.kdb > kmer_count_dataframe.tsv # Calculate similarity between two (or more) profiles -kmerdb distance correlation profile1.kdb profile2.kdb (...) +# The correlation distance from Numpy is used on one or more profiles, or piped output from 'kmerdb matrix'. +kmerdb distance correlation profile1.kdb profile2.kdb (...) > distance.tsv + +# A condensed, one-line invocation of the matrix and distance command using the bash shell's pipe mechanism is as follows. +kmerdb matrix pass *.8.kdb | kmerdb distance correlation STDIN > distance.tsv ``` -## Usage note: +## IUPAC support: ```bash kmerdb profile -k $k input.fa output.kdb # This may discard non-IUPAC characters, this feature lacks documentation! @@ -132,9 +146,9 @@ python setup.py test ## License -Created by Matthew Ralston - [Scientist, Programmer, Musician](http://matthewralston.github.io) - [Email](mailto:mrals89@gmail.com) +Created by Matthew Ralston - [Scientist, Programmer, Musician](http://matthewralston.github.io) - [Email](mailto:mralston.development@gmail.com) -Distributed under the Apache license. See `LICENSE.txt` for the copy distributed with this project. Open source software is not for everyone, but for those of us starting out and trying to put the ecosystem ahead of ego, we march into the information age with this ethos. +Distributed under the Apache license. See `LICENSE.txt` for the copy distributed with this project. Open source software is not for everyone, but we march into the information age with this ethos. I have the patent rights to this software. 
You may use and distribute this software, gratis, so long as the original LICENSE.txt is distributed along with the software. This software is distributed AS IS and provides no warranties of any kind. ``` Copyright 2020 Matthew Ralston @@ -154,7 +168,7 @@ Distributed under the Apache license. See `LICENSE.txt` for the copy distributed ## Contributing -1. Fork it () +1. Fork it () 2. Create your feature branch (`git checkout -b feature/fooBar`) 3. Commit your changes (`git commit -am 'Add some fooBar'`) 4. Push to the branch (`git push origin feature/fooBar`) @@ -164,22 +178,24 @@ Distributed under the Apache license. See `LICENSE.txt` for the copy distributed Thank you to the authors of kPAL and Jellyfish for the early inspiration. And thank you to others for the encouragement along the way, who shall remain nameless. I wanted this library to be a good strategy for assessing these k-mer profiles, in a way that is both cost aware of the analytical tasks at play, capable of storing the exact profiles in sync with the current assemblies, and then updating the kmer databases only when needed to generate enough spectral signature information. -The intention is that more developers would want to add functionality to the codebase or even just utilize things downstream, but to build out directly with numpy and scipy/scikit as needed to suggest the basic infrastructure for the ML problems and modeling approaches that could be applied to such datasets. This project has begun under GPL v3.0 and hopefully could gain some interest. +The intention is that more developers would want to add functionality to the codebase or even just utilize things downstream, but to build out directly with numpy and scipy/scikit as needed to suggest the basic infrastructure for the ML problems and modeling approaches that could be applied to such datasets. This project began under GPL v3.0 and was relicensed with Apache v2. Hopefully this project could gain some interest. I have so much fun working on just this one project. There's more to it than meets the eye. I'm working on a preprint, and the draft is included in some of the latest versions of the codebase, specifically .Rmd files. More on the flip-side of this file. Literally. And figuratively. It's so complex with technology these days. diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt new file mode 100644 index 0000000..f23c225 --- /dev/null +++ b/RELEASE_NOTES.txt @@ -0,0 +1,47 @@ +============= +| v0.7.8 | +============= +Still fixing a lot of issues w.r.t. the interface (logging info) and stepping through neighbor list creation/validations, the adjacencies format specification, and other issues w.r.t. the main __init__.py profile and graph subcommands (profile, make_kdbg in __init__.py, [ALL] fileutil.py ) + +Major issues. Ick. + + + + +============= +| v0.7.7 | +============= +New basic format spec (.kdbg) released for the weighted edge list. The IUPAC warning setting was abstracted into two base methods for the k-mer counter: validate_seqRecoard_and_detect_IUPAC. A _shred method is needed for the edge list; a performance assessment is needed vs the vanilla k-mer counter. + +The graph format has 3 numpy columns (columns 2, 3, and 4): the n1, n2, and edge_weight vars. Added to the metadata config. The solver solution is still unformed. + +Needs README.md and website description. + + +============= +| v0.7.6 | +============= +The tabular format specification has boiled down to a 4 or 5 column design, and the metadata header has been stabilized since 0.7.4, in Jan/Feb of 2023.
The header now consists of explicit Numpy dtypes, int64 most of the time. Frequency columns are included for the sake of it, but int count profiles have taken the front seat in the project. + +The columnar format is now: rownum, kmerid, count, frequency. 'Metadata', i.e. the 6 neighboring k-mer ids, is completely optional, and very much still in alpha. The scipy and biopython kmeans and hierarchical clustering features have been briefly tested, and the numpy distances now form the core of the distance command. + +I'm most proud of the profile and matrix commands; the latter may read profiles into memory in parallel, collating the count column as it goes. I'm not sure how this would perform on the sorted .kdb files. + +Minor bug fixes and regressions on the fileutil and __init__.py files round out 0.7.6 from 0.7.4. Basically reduces smell and tests the --sorted feature. The --re-sort and --un-sort features on the view command remain a little too untested... + +============= +| v0.6.5 | +------------- +The numerical backbone of the project has been solidified, with more sanity checks and assertions throughout runtime. Memory tends to be an issue even for mild choices of k. We are now using 'uint64' and 'float64' for indexes, counts, and frequencies. Parallelization has been improved in the matrix command for 'quick' loading of count profiles into memory. Currently KDBReader is lazy-loading, only reading the header metadata when the file is first opened. Behavior other than the 'slurp()' and '_slurp' methods is decided only by the source and the Bio.bgzf module. In principle, you could read the file line-by-line if you wanted to, but the behavior is sufficient at the moment for acceptance testing. + +In addition to these 'features', my focus has mostly been on getting the ideal Pearson and Spearman correlation coefficients to understand profile fidelity behavior. + + +============= +| v0.0.7 | +------------- +There have been 3 pre-releases in the codebase thus far, and we are on version number 0.0.7. The codebase has changed into a sophisticated on-disk k-mer counting strategy, with multiple parallelization options. The first is native OS parallelism, using something like GNU parallel to run the program on many genomes or metagenomes simultaneously. The second parallelization option uses the Python3 multiprocessing library, particularly for processing fastq datasets. + +When I say on-disk, I mean the file format I've created is essentially an indexed database. This allows rapid access to sequential transition probabilities, on-disk, during a random walk representing a Markov process. This is a key feature for anyone who wants to seriously traverse a 4^k-dimensional space. + +The codebase also currently contains a randomization feature, distance matrices, arbitrary interoperability with Pandas dataframes, clustering features, normalization, standardization, and dimensionality reduction. The suite is currently ready for a regression feature that I've promised, and I'd like to implement this early this Spring. Next, I'd be interested in working on the following features that would make the suite ready for another beta release. diff --git a/TODO.org b/TODO.org index 7274634..fb7e2c3 100644 --- a/TODO.org +++ b/TODO.org @@ -7,6 +7,63 @@ +* 3/25/24 - finished weighted edge list, planning assembler +** Personal Remarks +*** Today marks the beginning of the end... of the De Bruijn graph format pull-request from branch 'graph_algo' +*** I'm doing a little bit better mentally.
Learned today about non-stimulant ADHD meds +*** In hindsight, I've never been diagnosed with ADHD. I have reasonable hyper-focus, but I get derailed with alternate versions of ... oops, I literally forgot what the psychiatrist calls it when you change tasks and get unfocused. Wow. +*** I like my new therapist/counselor and her level of care seems nice. Let's see how the next 3 months go. +*** Okay, that's enough about me™. +** Project remarks +*** I'm very happy with the recent additions to the graph_algo branch. The feature 'seems' to be working quite well regarding neighboring/subsequent k-mers appended to the id array. +*** Specifically, I have a --quiet option that will silence most output delivered to the screen, in addition to the verbosity setting. +*** By DEFAULT I print an obnoxious amount of output to the STDERR stream, without the verbosity settings changed from the default of warning level (-v, -vv). +*** I believe this demonstrates to the user how adjacencies in the id array are considered, i.e. that they have the k-1 subsequence in common. +*** The assertions introduced in kmerdb.graph are essential to verify these adjacencies; when subsequent k-mers fail the check, an error propagates and is displayed to the user as a warning +*** describing the nature of the assertion failures and suggesting the reason why. +*** More specifically: it should be added to the README.md that the number of assertion failures should roughly equal the number of reads in a .fq file, the issue being triggered by k-mer ids from the end of one read and the beginning of the next. + +NOTE: ADJACENCY ERRORS DETECTED: Found 24999 'improper' k-mer pairs/adjacencies from the input file(s), + where subsequent k-mers in the k-mer id array (produced from the sliding window method over input seqs/reads) did not share k-1 residues in common. + These *may* be introduced in the array from the last k-mer of one seq/read (in the .fa/.fq) and the first k-mer of the next seq/read. +*** Okay, with this settled, I can now describe any plans for revision to the .kdbg format, as well as a description of a first-pass NetworkX-based solution to graph traversal and stop criterion during contig generation. +*** With that said, I absolutely need a visualizer at this point to check my work. +** TODO Code cleanup +*** Documentation +**** Deprecations +***** strand_specific +***** all_metadata +**** IUPAC +**** README +*** kmerdb module + - [X] kmer.py + - [ ] verbose => quiet + - [X] graph.py + - [X] parse.py + - [ ] __init__.py +*** README.md + - [ ] README.md + - [ ] Document the *new* IUPAC strategy for 'kmerdb.kmer._shred_for_graph' + - [ ] Provide +*** website - matthewralston.github.io/kmerdb + - [/] Expanded documentation on subcommands. + - [ ] profile + - [ ] view + - [ ] distance (SWAP ORDER) + - [ ] matrix (SWAP ORDER) + - [ ] NEW! graph + - [ ] kmeans + - [ ] hierarchical + - [ ] probability + - [ ] DONT DO YET graph/assembly page + - [/] API + - [ ] reading .kdbg or .kdb files + - [ ] writing .kdbg or .kdb files +** TODO Assembly algorithm planning +** TODO CPU (NetworkX) implementation (overview) +** TODO Stop criterion + - [ ] when are the *necessary* traversals completed + - [ ] How do we rank these? * Lost comments @@ -22,9 +79,8 @@ ** Remembering that it's only supposed to be a k-mer count vector storage medium right now ** Scoping scoping where does it end ** Is my life's work pointless? -** Losing my best friend because of relapse -*** Sent 1 basic sorry, got an acknowledgement.
-*** Felt like shit, lost my fucking wallet and chillum +** Losing my best friend because of an argument +*** Sent 1 basic sorry, got a minor acknowledgement. *** Smoking habit down to 1 cig a day (just bored, less and less dynamism of focus. *** Recalling the CortizoneTM *** Apply gently @@ -34,7 +90,7 @@ ** Time/money management issues mounting * Code maintenance -** TODO COMMENTS [7/7] +** FEEDBACK COMMENTS [7/7] DEADLINE: <2022-01-29 Sat> SCHEDULED: <2022-01-27 Thu> - [X] util - [X] merge_metadata_lists [3/3] diff --git a/kmerdb/__init__.py b/kmerdb/__init__.py index 3f1e37a..a434e3b 100644 --- a/kmerdb/__init__.py +++ b/kmerdb/__init__.py @@ -95,7 +95,7 @@ def markov_probability(arguments): """ import pandas as pd import numpy as np - from kmerdb import fileutil, index, probability, seqparser + from kmerdb import fileutil, index, probability, parse from kmerdb.config import DONE if os.path.splitext(arguments.kdb)[-1] != ".kdb": @@ -109,7 +109,7 @@ def markov_probability(arguments): with fileutil.open(arguments.kdb, 'r', slurp=True) as kdb: k = kdb.metadata['k'] with index.open(arguments.kdbi, 'r') as kdbi: - with seqparser.SeqParser(arguments.seqfile, arguments.fastq_block_size, k) as seqprsr: + with parse.SeqParser(arguments.seqfile, arguments.fastq_block_size, k) as seqprsr: recs = [r for r in seqprsr] if seqprsr.fastq: logger.debug("Read exactly b=={0}=={1} records from the {2} seqparser object".format(b, len(recs), s)) @@ -129,7 +129,7 @@ def markov_probability(arguments): else: np.append(profiles, markov_probs, axis=0) - recs = [r for r in seqprsr] # Essentially accomplishes an iteration in the file, wrapped by the seqparser.SeqParser class + recs = [r for r in seqprsr] # Essentially accomplishes an iteration in the file, wrapped by the parse.SeqParser class df = pd.DataFrame(profiles, columns=["SequenceID", "Log_Odds_ratio", "p_of_seq"]) df.to_csv(sys.stdout, sep=arguments.delimiter, index=False) @@ -906,26 +906,40 @@ def header(arguments): """ Another end-user function that takes an argparse Namespace object. This function just reads the metadata header, can print in json.
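    Example (CLI), mirroring the README: `kmerdb header profile.8.kdb`
    prints the YAML metadata block; pass `-j`/`--json` to print JSON instead.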
""" - from kmerdb import fileutil, config, util + from kmerdb import fileutil, config, util, graph - if os.path.splitext(arguments.kdb)[-1] != ".kdb": - raise IOError("Viewable .kdb filepath does not end in '.kdb'") - - with fileutil.open(arguments.kdb, mode='r') as kdb_in: - if kdb_in.metadata["version"] != config.VERSION: + sfx = os.path.splitext(arguments.kdb)[-1] + metadata = None + + if sfx != ".kdb" and sfx != ".kdbg": # A filepath with invalid suffix + raise IOError("Viewable .kdb(g) filepath does not end in '.kdb' or '.kdbg'") + elif not os.path.exists(arguments.kdb): + raise IOError("Viewable .kdb(g) filepath '{0}' does not exist on the filesystem".format(arguments.kdb_in)) + + if sfx == ".kdb": + kdb = fileutil.open(arguments.kdb, mode='r', sort=arguments.re_sort, slurp=True) + metadata = kdb.metadata + kmer_ids_dtype = metadata["kmer_ids_dtype"] + N = 4**metadata["k"] + if metadata["version"] != config.VERSION: + logger.warning("KDB version is out of date, may be incompatible with current KDBReader class") + assert kdb.kmer_ids.size == N, "view | read kmer_ids size did not match N from the header metadata" + assert kdb.counts.size == N, "view | read counts size did not match N from the header metadata" + assert kdb.frequencies.size == N, "view | read frequencies size did not match N from the header metadata" + metadata = kdb.metadata + elif sfx == ".kdbg": + kdb = graph.open(arguments.kdb, mode='r') + if kdb.metadata["version"] != config.VERSION: logger.warning("KDB file version is out of date, may be incompatible with current fileutil.KDBReader class") - N = 4**kdb_in.metadata["k"] - - assert kdb_in.kmer_ids.size == N, "view | read kmer_ids size did not match N from the header metadata" - assert kdb_in.counts.size == N, "view | read counts size did not match N from the header metadata" - assert kdb_in.frequencies.size == N, "view | read frequencies size did not match N from the header metadata" + N = 4**kdb.metadata["k"] + metadata = kdb.metadata - if arguments.json: - print(dict(kdb_in.metadata)) - else: - yaml.add_representer(OrderedDict, util.represent_ordereddict) - print(yaml.dump(kdb_in.metadata)) - print(config.header_delimiter) + if arguments.json: + print(dict(kdb.metadata)) + else: + yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict) + print(yaml.dump(metadata)) + print(config.header_delimiter) def view(arguments): """ @@ -944,7 +958,7 @@ def view(arguments): """ import numpy as np - from kmerdb import fileutil, config, util, kmer + from kmerdb import fileutil, config, util, kmer, graph import json metadata = None N = None @@ -967,101 +981,454 @@ def get_header(line, header): return header_dict assert type(arguments.kdb_in) is str, "kdb_in must be a str" - if os.path.splitext(arguments.kdb_in)[-1] != ".kdb": # A filepath with invalid suffix - raise IOError("Viewable .kdb filepath does not end in '.kdb'") + sfx = os.path.splitext(arguments.kdb_in)[-1] + + + + + + if sfx != ".kdb" and sfx != ".kdbg": # A filepath with invalid suffix + raise IOError("Viewable .kdb(g) filepath does not end in '.kdb' or '.kdbg'") elif not os.path.exists(arguments.kdb_in): - raise IOError("Viewable .kdb filepath '{0}' does not exist on the filesystem".format(arguments.kdb_in)) - with fileutil.open(arguments.kdb_in, mode='r', sort=arguments.re_sort, slurp=True) as kdb_in: - metadata = kdb_in.metadata - kmer_ids_dtype = metadata["kmer_ids_dtype"] - N = 4**metadata["k"] + raise IOError("Viewable .kdb(g) filepath '{0}' does not exist on the 
filesystem".format(arguments.kdb_in)) + if sfx == ".kdb": + with fileutil.open(arguments.kdb_in, mode='r', sort=arguments.re_sort, slurp=True) as kdb_in: + metadata = kdb_in.metadata + kmer_ids_dtype = metadata["kmer_ids_dtype"] + N = 4**metadata["k"] + if metadata["version"] != config.VERSION: + logger.warning("KDB version is out of date, may be incompatible with current KDBReader class") + if arguments.kdb_out is None or (arguments.kdb_out == "/dev/stdout" or arguments.kdb_out == "STDOUT"): # Write to stdout, uncompressed + if arguments.header: + yaml.add_representer(OrderedDict, util.represent_ordereddict) + print(yaml.dump(metadata, sort_keys=False)) + print(config.header_delimiter) + logger.info("Reading from file...") + logger.debug("I cut off the json-formatted unstructured column for the main view.") + try: + if not arguments.un_sort and arguments.re_sort and metadata["sorted"] is True: + kmer_ids_sorted_by_count = np.lexsort((kdb_in.counts, kdb_in.kmer_ids)) + reverse_kmer_ids_sorted_by_count = np.flipud(kmer_ids_sorted_by_count) + for i, idx in enumerate(kmer_ids_sorted_by_count): + kmer_id = kdb_in.kmer_ids[i] + logger.debug("The first is an implicit row-index. The second is a k-mer id, then the counts and frequencies.") + logger.debug("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id])) + + print("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id])) + else: + for i, idx in enumerate(kdb_in.kmer_ids): + kmer_id = kdb_in.kmer_ids[idx] + logger.debug("The row in the file should follow this order:") + logger.debug("The first is an implicit row-index. The second is a k-mer id, then the counts and frequencies.") + logger.debug("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id])) + try: + if arguments.un_sort is True: + assert kmer_id == idx, "view | kmer_id {0} didn't match the expected k-mer id.".format(idx, kmer_id) + assert i == kmer_id, "view | kmer_id {0} didn't match the implicit index {1}".format(idx, i) + else: + #logger.debug("Not sorting, so skipping assertion about profile (col1, column 2)") + pass + except AssertionError as e: + logger.warning(e) + logger.warning("K-mer id {0} will be printed in the {1} row".format(idx, i)) + #raise e + logger.debug("{0} line:".format(i)) + logger.debug("=== = = = ======= = = = = = = |") + if arguments.un_sort is True: + print("{0}\t{1}\t{2}\t{3}".format(i, idx, kdb_in.counts[idx], kdb_in.frequencies[idx])) + else: + print("{0}\t{1}\t{2}\t{3}".format(i, idx, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id])) + # I don't think anyone cares about the graph representation. + # I don't think this actually matters because I can't figure out what the next data structure is. + # Is it a Cypher query and creation node set? + # I need to demonstrate a capacity for graph based learning. + # (:-|X) The dread pirate roberts got me. + # :) + except BrokenPipeError as e: + logger.error(e) + raise e + if arguments.kdb_out is not None and arguments.compress: # Can't yet write compressed to stdout + logger.error("Can't write kdb to stdout! 
We need to use a Bio.bgzf filehandle.") + sys.exit(1) + elif arguments.kdb_out is not None and type(arguments.kdb_out) is not str: + raise ValueError("Cannot write a file to an argument that isn't a string") + elif arguments.kdb_out is not None and os.path.exists(arguments.kdb_out): + logger.warning("Overwriting '{0}'...".format(arguments.kdb_out)) + elif arguments.kdb_out is not None and not os.path.exists(arguments.kdb_out): + logger.debug("Creating '{0}'...".format(arguments.kdb_out)) + if arguments.kdb_out is not None: + with fileutil.open(arguments.kdb_in, 'r', dtype=suggested_dtype, sort=arguments.sorted, slurp=True) as kdb_in: + assert kdb_in.kmer_ids.size == N, "view | read kmer_ids size did not match N from the header metadata" + assert kdb_in.counts.size == N, "view | read counts size did not match N from the header metadata" + assert kdb_in.frequencies.size == N, "view | read frequencies size did not match N from the header metadata" + with fileutil.open(arguments.kdb_out, metadata=metadata, mode='w') as kdb_out: + try: + for i, idx in enumerate(kdb_in.kmer_ids): + kmer_id = idx + seq = kmer.id_to_kmer(kmer_id, arguments.k) + kmer_metadata = kmer.neighbors(seq, arguments.k) + logger.debug("The first is the actual row id. This is the recorded row-id in the file. This should always be sequential. Next is the k-mer id. ") + kdb_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id], kmer_metadata)) + + except StopIteration as e: + logger.error(e) + raise e + finally: + #kdb_out._write_block(kdb_out._buffer) + #kdb_out._handle.flush() + #kdb_out._handle.close() + sys.stderr.write(config.DONE) + elif sfx == ".kdbg": + kdbg_in = graph.open(arguments.kdb_in, mode='r', slurp=True) + metadata = kdbg_in.metadata + + n1_dtype = metadata["n1_dtype"] + n2_dtype = metadata["n2_dtype"] + weights_dtype = metadata["weights_dtype"] + + if metadata["version"] != config.VERSION: logger.warning("KDB version is out of date, may be incompatible with current KDBReader class") if arguments.kdb_out is None or (arguments.kdb_out == "/dev/stdout" or arguments.kdb_out == "STDOUT"): # Write to stdout, uncompressed if arguments.header: - yaml.add_representer(OrderedDict, util.represent_ordereddict) + yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict) print(yaml.dump(metadata, sort_keys=False)) print(config.header_delimiter) logger.info("Reading from file...") logger.debug("I cut off the json-formatted unstructured column for the main view.") - try: - if not arguments.un_sort and arguments.re_sort and metadata["sorted"] is True: - kmer_ids_sorted_by_count = np.lexsort((kdb_in.counts, kdb_in.kmer_ids)) - reverse_kmer_ids_sorted_by_count = np.flipud(kmer_ids_sorted_by_count) - for i, idx in enumerate(kmer_ids_sorted_by_count): - kmer_id = kdb_in.kmer_ids[i] - logger.debug("The first is an implicit row-index. The second is a k-mer id, then the counts and frequencies.") - logger.debug("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id])) - print("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id])) - else: - for i, idx in enumerate(kdb_in.kmer_ids): - kmer_id = kdb_in.kmer_ids[idx] - logger.debug("The row in the file should follow this order:") - logger.debug("The first is an implicit row-index. 
The second is a k-mer id, then the counts and frequencies.") - logger.debug("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id])) - try: - if arguments.un_sort is True: - assert kmer_id == idx, "view | kmer_id {0} didn't match the expected k-mer id.".format(idx, kmer_id) - assert i == kmer_id, "view | kmer_id {0} didn't match the implicit index {1}".format(idx, i) - else: - #logger.debug("Not sorting, so skipping assertion about profile (col1, column 2)") - pass - except AssertionError as e: - logger.warning(e) - logger.warning("K-mer id {0} will be printed in the {1} row".format(idx, i)) - #raise e - logger.debug("{0} line:".format(i)) - logger.debug("=== = = = ======= = = = = = = |") - if arguments.un_sort is True: - print("{0}\t{1}\t{2}\t{3}".format(i, idx, kdb_in.counts[idx], kdb_in.frequencies[idx])) - else: - print("{0}\t{1}\t{2}\t{3}".format(i, idx, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id])) + for i in range(len(kdbg_in.n1)): + n1 = kdbg_in.n1[i] + n2 = kdbg_in.n2[i] + w = kdbg_in.weights[i] + logger.debug("The row in the file should follow this order:") + logger.debug("The first is an implicit row-index. The second and third are k-mer ids, then edge weight") + logger.debug("{0}\t{1}\t{2}\t{3}".format(i, n1, n2, w)) + logger.debug("{0} line:".format(i)) + logger.debug("=== = = = ======= = = = = = = |") + print("{0}\t{1}\t{2}\t{3}".format(i, n1, n2, w)) # I don't think anyone cares about the graph representation. # I don't think this actually matters because I can't figure out what the next data structure is. # Is it a Cypher query and creation node set? # I need to demonstrate a capacity for graph based learning. # (:-|X) The dread pirate roberts got me. # :) - except BrokenPipeError as e: - logger.error(e) - raise e - if arguments.kdb_out is not None and arguments.compress: # Can't yet write compressed to stdout - logger.error("Can't write kdb to stdout! We need to use a Bio.bgzf filehandle.") - sys.exit(1) - elif arguments.kdb_out is not None and type(arguments.kdb_out) is not str: - raise ValueError("Cannot write a file to an argument that isn't a string") - elif arguments.kdb_out is not None and os.path.exists(arguments.kdb_out): - logger.warning("Overwriting '{0}'...".format(arguments.kdb_out)) - elif arguments.kdb_out is not None and not os.path.exists(arguments.kdb_out): - logger.debug("Creating '{0}'...".format(arguments.kdb_out)) - if arguments.kdb_out is not None: - with fileutil.open(arguments.kdb_in, 'r', dtype=suggested_dtype, sort=arguments.sorted, slurp=True) as kdb_in: - assert kdb_in.kmer_ids.size == N, "view | read kmer_ids size did not match N from the header metadata" - assert kdb_in.counts.size == N, "view | read counts size did not match N from the header metadata" - assert kdb_in.frequencies.size == N, "view | read frequencies size did not match N from the header metadata" - with fileutil.open(arguments.kdb_out, metadata=metadata, mode='w') as kdb_out: - try: - for i, idx in enumerate(kdb_in.kmer_ids): - kmer_id = idx - seq = kmer.id_to_kmer(kmer_id, arguments.k) - kmer_metadata = kmer.neighbors(seq, arguments.k) - logger.debug("The first is the actual row id. This is the recorded row-id in the file. This should always be sequential. Next is the k-mer id. 
") - kdb_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id], kmer_metadata)) + if arguments.kdb_out is not None and arguments.compress: # Can't yet write compressed to stdout + logger.error("Can't write kdb to stdout! We need to use a Bio.bgzf filehandle.") + sys.exit(1) + elif arguments.kdb_out is not None and type(arguments.kdb_out) is not str: + raise ValueError("Cannot write a file to an argument that isn't a string") + elif arguments.kdb_out is not None and os.path.exists(arguments.kdb_out): + logger.warning("Overwriting '{0}'...".format(arguments.kdb_out)) + elif arguments.kdb_out is not None and not os.path.exists(arguments.kdb_out): + logger.debug("Creating '{0}'...".format(arguments.kdb_out)) + if arguments.kdb_out is not None: + with graph.open(arguments.kdb_out, metadata=metadata, mode='w') as kdb_out: + try: + for i in range(len(kdbg.n1)): + kdb_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, kdbg_in.n1[i], kdbg_in.n2[i], kdbg_in.w[i])) - except StopIteration as e: - logger.error(e) - raise e - finally: - #kdb_out._write_block(kdb_out._buffer) - #kdb_out._handle.flush() - #kdb_out._handle.close() - sys.stderr.write(config.DONE) + except StopIteration as e: + logger.error(e) + raise e + finally: + #kdb_out._write_block(kdb_out._buffer) + #kdb_out._handle.flush() + #kdb_out._handle.close() + sys.stderr.write(config.DONE) + + +def assembly(arguments): + from kmerdb import graph + + + + + + + + + assert type(arguments.kdbg) is str, "kdbg must be a str" + sfx = os.path.splitext(arguments.kdbg)[-1] + + + + + + if sfx != ".kdb" and sfx != ".kdbg": # A filepath with invalid suffix + raise IOError("Viewable .kdb(g) filepath does not end in '.kdb' or '.kdbg'") + elif not os.path.exists(arguments.kdbg): + raise IOError("Viewable .kdb(g) filepath '{0}' does not exist on the filesystem".format(arguments.kdbg)) + elif sfx == ".kdbg": + with graph.open(arguments.kdbg, mode='r', slurp=True) as kdbg_in: + metadata = kdbg_in.metadata + + N = len(kdbg_in.n1) + n1_dtype = metadata["n1_dtype"] + n2_dtype = metadata["n2_dtype"] + weights_dtype = metadata["weights_dtype"] + + + if metadata["version"] != config.VERSION: + logger.warning("KDB version is out of date, may be incompatible with current KDBReader class") + if arguments.kdb_out is None or (arguments.kdb_out == "/dev/stdout" or arguments.kdb_out == "STDOUT"): # Write to stdout, uncompressed + if arguments.header: + yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict) + print(yaml.dump(metadata, sort_keys=False)) + print(config.header_delimiter) + logger.info("Reading from file...") + logger.debug("I cut off the json-formatted unstructured column for the main view.") + try: + + + + nodes = list(set(kdbg_in.n1 + kdbg_in.n2)) + edges = list(zip(kdbg_in.n1, kdbg_in.n2, list(map(lambda w: {'weight': w} , kdbg_in.weights)))) + + graph.create_graph(nodes, edges) + + except BrokenPipeError as e: + logger.error(e) + raise e + + + + +def make_kdbg(arguments): + """ + Another ugly function that takes a argparse Namespace object as its only positional argument + + Basically, from a fasta file, I want to generate a file format that consists of unique row ids, k-mer ids, adjacency list, + + and finally an int field (0 represents unused) representing the order of the row-kmer being used in a graph traversal. + + Note, that the .kdbg format may not be easily regenerated from a k-mer count vector alone, and thus a .fasta/.fastq is needed. 
+ + The goal isn't to yet implement the traversal algorithm in this first commit. But instead I'd like to get the format specified. + """ + from multiprocessing import Pool + + + import jsonschema + + import numpy as np + + from kmerdb import graph, kmer, util + from kmerdb.config import VERSION + + + + logger.debug("Printing entire CLI argparse option Namespace...") + logger.debug(arguments) + + # The extension should be .kdb because I said so. + logger.info("Checking extension of output file...") + if os.path.splitext(arguments.kdbg)[-1] != ".kdbg": + raise IOError("Destination .kdbg filepath does not end in '.kdbg'") + + file_metadata = [] + theoretical_kmers_number = 4**arguments.k # Dimensionality of k-mer profile + + + logger.info("Parsing {0} sequence files to generate a k-mer adjacency list...".format(len(list(arguments.seqfile)))) + infile = graph.Parseable(arguments) # NOTE: Uses the kmerdb.graph module + + + logger.info("Processing {0} fasta/fastq files across {1} processors...".format(len(list(arguments.seqfile)), arguments.parallel)) + logger.debug("Parallel (if specified) mapping the kmerdb.parse.parsefile() method to the seqfile iterable") + logger.debug("In other words, running the kmerdb.parse.parsefile() method on each file specified via the CLI") + if arguments.parallel > 1: + with Pool(processes=arguments.parallel) as pool: + # data files_metadata + data = pool.map(infile.parsefile, arguments.seqfile) # Returns a list of k-mer ids + else: + # data files_metadata + data = list(map(infile.parsefile, arguments.seqfile)) + """ + Summary statistics and metadata structure + """ + + nullomer_ids = [] + counts = [] + for d in data: + nullomer_ids.extend(d[3]) + counts.extend(d[2]) + file_metadata.append(d[1]) + N = 4**arguments.k + + + all_observed_kmers = int(np.sum(counts)) + + unique_nullomers = len(set(nullomer_ids)) + + unique_kmers = int(np.count_nonzero(counts)) + + # Key assertion + assert unique_kmers + unique_nullomers == theoretical_kmers_number, "kmerdb | internal error: unique nullomers ({0}) + unique kmers ({1}) should equal 4^k = {2} (was {3})".format(unique_nullomers, unique_kmers, theoretical_kmers_number, unique_kmers + unique_nullomers) + #logger.info("created a k-mer composite in memory") + + + metadata=OrderedDict({ + "version": VERSION, + "metadata_blocks": 1, + "k": arguments.k, + "total_kmers": all_observed_kmers, + "unique_kmers": unique_kmers, + "unique_nullomers": unique_nullomers, + "sorted": arguments.sorted, + "n1_dtype": "uint64", + "n2_dtype": "uint64", + "weights_dtype": "uint64", + "tags": [], + "files": file_metadata + }) + + + logger.info("Validating aggregated metadata structure for .kdbg header...") + try: + np.dtype(metadata["n1_dtype"]) + np.dtype(metadata["n2_dtype"]) + np.dtype(metadata["weights_dtype"]) + except TypeError as e: + logger.error(metadata) + logger.error(e) + logger.error("kmerdb encountered a type error and needs to exit") + raise TypeError("Incorrect dtype detected in metadata structure. 
Internal error.") + + try: + jsonschema.validate(instance=metadata, schema=config.graph_schema) + except jsonschema.ValidationError as e: + logger.debug(e) + logger.error("kmerdb.graph.KDBGReader couldn't validate the header/metadata YAML") + raise e + + logger.debug("Validation complete.") + + + sys.stderr.write("\n\n\tCompleted summation and metadata aggregation across all inputs...\n\n") + + + + """ + ACCUMULATE ALL EDGE WEIGHTS ACROSS ALL FILES + """ + """ + Step 1: initialize the final edge datastructure, a hashmap, keyed on a pair of k-mer ids, and containing only an integer weight + """ + # Initialize empty data structures + all_edges_in_kspace = {} + n1 = [] + n2 = [] + weights = [] + # Loop over the input files and initialize the dictionary on the valid pairs, setting them to 0. + for d in data: + edges, h, counts, nullomers = d + pair_ids = edges.keys() + for p in pair_ids: + all_edges_in_kspace[p] = 0 + """ + Step 2: Accumulate all edge weights across all files + """ + for d in data: + edges, h, counts, nullomers = d + pair_ids = edges.keys() + + for p in pair_ids: + try: + all_edges_in_kspace[p] += edges[p] + except KeyError as e: + logger.error("Unknown edge detected, evaded prepopulation. Internal error.") + raise e + """ + Step 3: Add edges (2 k-mer ids) and weight to lists for conversion to NumPy array. + """ + for e in all_edges_in_kspace.keys(): + n1.append(e[0]) + n2.append(e[1]) + weights.append(all_edges_in_kspace[e]) + """ + N would be the number of edges, pairs of nodes, and weights. + """ + N = len(n1) + + logger.debug("Initializing Numpy arrays of {0} uint for the edges and corresponding weight...".format(N)) + n1 = np.array(n1, dtype=metadata["n1_dtype"]) + n2 = np.array(n2, dtype=metadata["n2_dtype"]) + weights = np.array(weights, dtype=metadata["weights_dtype"]) + logger.info("Initialization of profile completed, using approximately {0} bytes per array".format(n1.nbytes)) + + """ + Write the YAML metadata header to the .kdbg file + """ + yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict) + sys.stderr.write(yaml.dump(metadata, sort_keys=False)) + + sys.stderr.write("\n\n\n") + logger.info("Wrote metadata header to .kdbg...") + + + """ + Cause pandas isn't edge-y enough. + """ + # Convert the list of numpy arrays (the uint64 3-tuple of the edge) to pandas dataframe + #df = pd.DataFrame(twoD_weighted_edge_list) + #df.to_csv(sys.stdout, sep=arguments.output_delimiter, index=False) + + + """ + Write the dataset (weighted edge list) to a file with '.kdbg' as its suffix. 
+ """ + kdbg_out = graph.open(arguments.kdbg, mode='wb', metadata=metadata) + + try: + sys.stderr.write("\n\n\nWriting edge list to {0}...\n\n\n".format(arguments.kdbg)) + for i, node1 in enumerate(n1): + + node2 = n2[i] + w = weights[i] + + tupley = (i, node1, node2, w) + tupley_dl = np.array(tupley, dtype="uint64") + if arguments.quiet is False: + print("{0}\t{1}\t{2}\t{3}".format(i, node1, node2, w)) + # i, node1, node2, weight + kdbg_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, node1, node2, w)) + finally: + kdbg_out._write_block(kdbg_out._buffer) + kdbg_out._handle.flush() + kdbg_out._handle.close() + + + """ + Done around n birfday + 3/20/24 + + Final statistics to stderr + """ + sys.stderr.write("\n\n\nFinal stats:\n\n\n") + + sys.stderr.write("Total k-mers processed: {0}\n".format(all_observed_kmers)) + sys.stderr.write("Unique nullomer count: {0}\n".format(unique_nullomers)) + sys.stderr.write("Unique {0}-mer count: {1}\n".format(arguments.k, unique_kmers)) + sys.stderr.write("Theoretical {0}-mer number (4^{0}): {1}\n".format(arguments.k, theoretical_kmers_number)) + sys.stderr.write("="*30 + "\n") + sys.stderr.write(".kdbg stats:\n") + sys.stderr.write("-"*30 + "\n") + sys.stderr.write("Edges in file: {0}\n".format(N)) + sys.stderr.write("Non-zero weights: {0}\n".format(int(np.count_nonzero(weights)))) + sys.stderr.write("\nDone\n") + + logger.info("Done printing weighted edge list to .kdbg") + + sys.stderr.write(config.DONE) + + + def profile(arguments): """ - A complex, near-end user function that handles an arparse Namespace as its only positional argument + A complex, near-end user function that handles a argparse Namespace as its only positional argument This function handles multiprocessing, NumPy type checking and array initialization, full metadata expansion if needed. @@ -1106,6 +1473,7 @@ def profile(arguments): file_metadata = [] total_kmers = 4**arguments.k # Dimensionality of k-mer profile N = total_kmers + theoretical_kmers_number = N logger.info("Parsing {0} sequence files to generate a composite k-mer profile...".format(len(list(arguments.seqfile)))) nullomers = set() @@ -1126,7 +1494,7 @@ def profile(arguments): # 'data' is now a list of 4-tuples # Each 4-tuple represents a single file - # (counts, header_dictionary, nullomers, all_kmer_metadata) + # (edges, header_dictionary, nullomers, all_kmer_metadata) # Construct a final_counts array for the composite profile across all inputs logger.debug("Initializing large list for extended metadata") @@ -1143,9 +1511,54 @@ def profile(arguments): all_kmer_metadata = util.merge_metadata_lists(arguments.k, all_kmer_metadata, d[3]) sys.stderr.write("\n\n\tCompleted summation and metadata aggregation across all inputs...\n\n") - unique_kmers = int(np.count_nonzero(counts)) - total_nullomers = total_kmers - unique_kmers + # unique_kmers = int(np.count_nonzero(counts)) + # #total_nullomers = total_kmers - unique_kmers + + # nullomer_ids = list(map(lambda n: n[2], data)) + # all_observed_kmers = int(np.sum(list(map(lambda h: h['total_kmers'], file_metadata)))) + + + + + """ + 3/20/24 + *Massive* nullomer and count validation regression. 
+ """ + + """ + Summary statistics and metadata structure + """ + + nullomer_ids = [] + counts = [] + file_metadata = [] + for d in data: + nullomer_ids.extend(d[2]) + counts.extend(d[0]) + file_metadata.append(d[1]) + all_observed_kmers = int(np.sum(counts)) + unique_nullomers = len(set(nullomer_ids)) + unique_kmers = int(np.count_nonzero(counts)) + + + # Key assertion + assert unique_kmers + unique_nullomers == theoretical_kmers_number, "kmerdb | internal error: unique nullomers ({0}) + unique kmers ({1}) should equal 4^k = {2} (was {3})".format(unique_nullomers, unique_kmers, theoretical_kmers_number, unique_kmers + unique_nullomers) + #logger.info("created a k-mer composite in memory") + + + # nullomer_ids = [] + + # for ns in file_metadata: + # nullomer_ids.extend(ns["nullomer_array"]) + + # unique_nullomers = len(set(nullomer_ids)) + # # + # unique_kmers = int(np.sum(list(map(lambda h: h["unique_kmers"], file_metadata)))) + + # assert unique_kmers + unique_nullomers == N, "kmerdb | internal error: unique nullomers ({0}) + unique kmers ({1}) should equal 4^k = {2} (was {3})".format(unique_nullomers, unique_kmers, N, unique_kmers + unique_nullomers) + + # all_observed_kmers = int(np.sum(counts)) logger.info("Initial counting process complete, creating BGZF format file (.kdb)...") logger.info("Formatting master metadata dictionary...") @@ -1156,7 +1569,7 @@ def profile(arguments): "k": arguments.k, "total_kmers": all_observed_kmers, "unique_kmers": unique_kmers, - "unique_nullomers": total_nullomers, + "unique_nullomers": unique_nullomers, "metadata": arguments.all_metadata, "sorted": arguments.sorted, "kmer_ids_dtype": "uint64", @@ -1164,7 +1577,7 @@ def profile(arguments): "count_dtype": "uint64", "frequencies_dtype": "float64", "tags": [], - "files": [d[1] for d in data] + "files": file_metadata }) try: @@ -1183,7 +1596,7 @@ def profile(arguments): counts = np.array(counts, dtype=metadata["count_dtype"]) frequencies = np.divide(counts, metadata["total_kmers"]) logger.info("Initialization of profile completed, using approximately {0} bytes per profile".format(counts.nbytes)) - yaml.add_representer(OrderedDict, util.represent_ordereddict) + yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict) sys.stderr.write(yaml.dump(metadata, sort_keys=False)) @@ -1211,7 +1624,10 @@ def profile(arguments): reverse_kmer_ids_sorted_by_count = np.flipud(kmer_ids_sorted_by_count) for i, idx in enumerate(reverse_kmer_ids_sorted_by_count): seq = kmer.id_to_kmer(idx, arguments.k) - kmer_metadata = kmer.neighbors(seq, arguments.k) # metadata is initialized by the neighbors + kmer_id = int(idx) + + + kmer_metadata = kmer.neighbors(seq, kmer_id, arguments.k) # metadata is initialized by the neighbors reads = [] starts = [] reverses = [] @@ -1236,50 +1652,59 @@ def profile(arguments): else: j = 0 for i, idx in enumerate(kmer_ids): - kmer_id = int(idx) seq = kmer.id_to_kmer(kmer_id, arguments.k) + kmer_id = int(idx) + + kmer_metadata = kmer.neighbors(seq, kmer_id, arguments.k) #logger.info("{0}\t{1}\t{2}\t{3}\t{4}".format(i, idx, kmer_ids[i], counts[kmer_id], frequencies[kmer_id])) - kmer_metadata = kmer.neighbors(seq, arguments.k) + all_metadata.append(kmer_metadata) - counts[idx] = counts[idx] - frequencies[idx] = frequencies[idx] + c = counts[idx] + f = frequencies[idx] logger.info("First is the implicit row index, next is a k-mer id, next is the corresponding k-mer id to the row-index (may not match from sorting), next is the count and frequencies") if arguments.quiet is 
not True: - print("{0}\t{1}\t{2}\t{3}".format(i, kmer_ids[idx], counts[idx], frequencies[idx])) - kdb_out.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(i, kmer_ids[idx], counts[idx], frequencies[idx], json.dumps(kmer_metadata))) + print("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, c, f)) + kdb_out.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(i, kmer_id, c, f, json.dumps(kmer_metadata))) j += 1 else: if arguments.sorted: - duple_of_arrays = (kmer_ids, counts) + kmer_ids_sorted_by_count = np.lexsort(duple_of_arrays) - reverse_kmer_ids_sorted_by_count = np.flipud(kmer_ids_sorted_by_count) + reverse_kmer_ids_sorted_by_count = list(kmer_ids_sorted_by_count) + logger.info("FIXME before reverse : ", reverse_kmer_ids_sorted_by_count) + reverse_kmer_ids_sorted_by_count.reverse() + logger.info("FIXME after reverse : ", reverse_kmer_ids_sorted_by_count) logger.debug("K-mer id sort shape: {0}".format(len(kmer_ids_sorted_by_count))) for i, idx in enumerate(reverse_kmer_ids_sorted_by_count): kmer_id = int(kmer_ids[idx]) - logger.info("{0}\t{1}\t{2}\t{3}".format(i, kmer_ids[idx], counts[idx], frequencies[idx])) seq = kmer.id_to_kmer(kmer_id, arguments.k) - kmer_metadata = kmer.neighbors(seq, arguments.k) + kmer_metadata = kmer.neighbors(seq, kmer_id, arguments.k) + + logger.info("{0}\t{1}\t{2}\t{3}".format(i, kmer_ids[idx], counts[idx], frequencies[idx])) all_metadata.append(kmer_metadata) - counts[idx] = counts[idx] - frequencies[idx] = frequencies[idx] + c[idx] = counts[idx] + f[idx] = frequencies[idx] if arguments.quiet is not True: - print("{0}\t{1}\t{2}\t{3}".format(i, kmer_ids[idx], counts[idx], frequencies[idx])) - kdb_out.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(i, kmer_ids[idx], counts[idx], frequencies[idx], json.dumps(kmer_metadata))) + print("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, c, f)) + kdb_out.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(i, kmer_id, c, f, json.dumps(kmer_metadata))) else: for i, idx in enumerate(kmer_ids): - kmer_id = int(idx) - logger.info("{0}\t{1}\t{2}\t{3}".format(i, kmer_ids[idx], counts[idx], frequencies[idx])) - p = profile[i] - c = counts[i] - f = frequencies[i] - seq = kmer.id_to_kmer(int(kmer_id), arguments.k) - kmer_metadata = kmer.neighbors(seq, arguments.k) # metadata is initialized by the neighbors + + + kmer_id = int(kmer_ids[idx]) + seq = kmer.id_to_kmer(kmer_id, arguments.k) + c = counts[idx] + f = frequencies[idx] + + kmer_metadata = kmer.neighbors(seq, kmer_id, arguments.k) # metadata is initialized by the neighbors + logger.info("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, c, f)) + all_metadata.append(kmer_metadata) if arguments.quiet is not True: - print("{0}\t{1}\t{2}\t{3}".format(i, kmer_ids[idx], counts[idx], frequencies[idx])) - kdb_out.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(i, kmer_ids[idx], counts[idx], frequencies[idx], json.dumps(kmer_metadata))) + print("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, c, f)) + kdb_out.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(i, kmer_id, c, f, json.dumps(kmer_metadata))) logger.info("Wrote 4^k = {0} k-mer counts + neighbors to the .kdb file.".format(total_kmers)) logger.info("Done") @@ -1289,13 +1714,21 @@ def profile(arguments): kdb_out._handle.flush() kdb_out._handle.close() + """ + Done around n birfday + 3/20/24 + + Final statistics to stderr + """ + sys.stderr.write("\n\n\nFinal stats:\n\n\n") sys.stderr.write("Total k-mers processed: {0}\n".format(all_observed_kmers)) - sys.stderr.write("Final nullomer count: {0}\n".format(total_nullomers)) + sys.stderr.write("Unique nullomer count: {0}\n".format(unique_nullomers)) 
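    # Sanity note on the figures printed here (restating the key assertion above):
    # the observed unique k-mers and the unique nullomers partition k-space, i.e.
    #   unique_kmers + unique_nullomers == theoretical_kmers_number == 4**arguments.k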
sys.stderr.write("Unique {0}-mer count: {1}\n".format(arguments.k, unique_kmers)) - sys.stderr.write("Total {0}-mer count: {1}\n".format(arguments.k, total_kmers)) - - + sys.stderr.write("Theoretical {0}-mer number (4^{0}): {1}\n".format(arguments.k, theoretical_kmers_number)) + sys.stderr.write("="*30 + "\n") sys.stderr.write("\nDone\n") + + def get_root_logger(level): @@ -1372,6 +1805,29 @@ def cli(): header_parser.add_argument("-j", "--json", help="Print as JSON. DEFAULT: YAML") header_parser.add_argument("kdb", type=str, help="A k-mer database file (.kdb)") header_parser.set_defaults(func=header) + + graph_parser = subparsers.add_parser("graph", help="Generate an adjacency list from .fa/.fq files") + graph_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count") + graph_parser.add_argument("-k", default=12, type=int, help="Choose k-mer size (Default: 12)") + + graph_parser.add_argument("-p", "--parallel", type=int, default=1, help="Shred k-mers from reads in parallel") + + graph_parser.add_argument("-b", "--fastq-block-size", type=int, default=100000, help="Number of reads to load in memory at once for processing") + graph_parser.add_argument("--both-strands", action="store_true", default=False, help="Retain k-mers from the forward strand of the fast(a|q) file only") + graph_parser.add_argument("--quiet", action="store_true", default=False, help="Do not list all edges and neighboring relationships to stderr") + graph_parser.add_argument("--sorted", action="store_true", default=False, help=argparse.SUPPRESS) + #profile_parser.add_argument("--sparse", action="store_true", default=False, help="Whether or not to store the profile as sparse") + + graph_parser.add_argument("seqfile", nargs="+", type=str, metavar="<.fasta|.fastq>", help="Fasta or fastq files") + graph_parser.add_argument("kdbg", type=str, help=".kdbg file") + graph_parser.set_defaults(func=make_kdbg) + + # assembly_parser = subparsers.add_parser("assemble", help="Use NetworkX (and/or cugraph) to perform deBruijn graphs") + # assembly_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count") + # assembly_parser.add_argument("-g", "--gpu", action="store_true", default=False, help="Utilize GPU resources (requires CUDA library cugraph)") + # assembly_parser.add_argument("kdbg", type=str, help=".kdbg file") + # assembly_parser.set_defaults(func=assembly) + view_parser = subparsers.add_parser("view", help="View the contents of the .kdb file") view_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count") diff --git a/kmerdb/config.py b/kmerdb/config.py index 79676ea..2e3d9c7 100644 --- a/kmerdb/config.py +++ b/kmerdb/config.py @@ -17,9 +17,58 @@ -VERSION="0.7.6" +VERSION="0.7.8" header_delimiter = "\n" + ("="*24) + "\n" +graph_schema = { + "type": "object", + "properties": { + "version": {"type": "string"}, + "metadata_blocks": {"type": "number"}, + "k": {"type": "number"}, + "total_kmers": {"type": "number"}, + "unique_kmers": {"type": "number"}, + "total_nullomers": {"type": "number"}, + "sorted": {"type": "boolean"}, + "n1_dtype": {"type": "string"}, + "n2_dtype": {"type": "string"}, + "weights_dtype": {"type": "string"}, + "tags": { + "type": "array", + "items": {"type": "string"} + }, + "files": { + "type": "array", + "items": { + "type": "object", + "properties": { + "filename": {"type": "string"}, + "sha256": { + "type": "string", + "minLength": 64, + 
"maxLength": 64 + }, + "md5": { + "type": "string", + "minLength": 32, + "maxLength": 32 + }, + "total_reads": {"type": "number"}, + "total_kmers": {"type": "number"}, + "unique_kmers": {"type": "number"}, + "nullomers": {"type": "number"} + }, + "required": ["filename", "sha256", "md5", "total_reads", "total_kmers", "unique_kmers", "nullomers"] + } + }, + "comments": { + "type": "array", + "items": {"type": "string"} + } + }, + "required": ["version", "metadata_blocks", "k", "tags", "files", "total_kmers", "unique_kmers", "unique_nullomers", "n1_dtype", "n2_dtype", "weights_dtype"] +} + metadata_schema = { "type": "object", "properties": { @@ -28,7 +77,7 @@ "k": {"type": "number"}, "total_kmers": {"type": "number"}, "unique_kmers": {"type": "number"}, - "unique_nullomers": {"type": "number"}, + "total_nullomers": {"type": "number"}, "metadata": {"type": "boolean"}, "sorted": {"type": "boolean"}, "profile_dtype": {"type": "string"}, @@ -126,11 +175,6 @@ https://matthewralston.github.io/kmerdb -=========================================== -| P y P I | -=========================================== -https://pypi.org/project/kmerdb/ - =========================================== | G i t h u b | =========================================== @@ -142,23 +186,54 @@ | N o t Very H u m e r u s | ============================================ -https://matthewralston.github.io/blog/kmer-database-format-part-1 + Please cite my repository in your work! Feel free to e-mail me or reach out! Please check the README.md for more details. -https://github.com/MatthewRalston/kmerdb +https://matthewralston.github.io/kmerdb/ """ + + + + + + + + +SPONGEBOB = """ +⢀⣠⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣄⡀ +⣿⡋⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⢙⣿ +⣿⡇⠀⠀⠀⣠⣴⠾⠿⠷⣦⡀⢀⣴⠾⠿⠷⣦⣄⠀⠀⠀⢸⣿ +⢸⡇⠀⠀⢰⡟⠁⠀⣀⣀⠈⢿⡿⠁⣀⣀⠀⠈⢻⡆⠀⠀⢸⡇ +⢸⣷⠀⠀⢸⡇⠀⠀⠿⠿⠁⣸⣇⠈⠿⠿⠀⠀⢸⡇⠀⠀⣾⡇ +⠘⣿⠀⠀⠈⠻⣦⣄⣀⣤⣾⠛⠛⣷⣤⣀⣠⣴⠟⠁⠀⠀⣿⠃ +⠀⣿⠀⠘⢷⣄⠀⠉⠉⠙⢿⠄⠠⡿⠋⠉⠉⠀⣠⡾⠃⠀⣿⠀ +⠀⣿⠀⠀⠀⠙⠻⢶⣦⣤⣤⣤⣤⣤⣤⣴⡶⠟⠋⠀⠀⠀⣿⠀ +⠀⣿⡄⠀⠀⠀⠀⠀⣿⣀⣹⡇⢸⣏⣀⣿⠀⠀⠀⠀⠀⢠⣿⠀ +⠀⢿⡇⠀⠀⠀⠀⠀⠉⠉⠉⠁⠈⠉⠉⠉⠀⠀⠀⠀⠀⢸⡿⠀ +⠀⢸⣿⣿⣿⣿⣟⠛⠛⠛⣻⡿⢿⣟⠛⠛⠛⣻⣿⣿⣿⣿⡇⠀ +⠀⢸⣿⣿⣿⣿⣿⣷⣤⣾⣿⣶⣶⣿⣷⣤⣾⣿⣿⣿⣿⣿⡇⠀ +⠀⠘⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃⠘⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃⠀ +⠀⠀⠉⠉⠉⠉⠉⠉⠉⠉⠻⣧⣼⠟⠉⠉⠉⠉⠉⠉⠉⠉⠀⠀ +⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠁⠀⠀⠀⠀⠀⠀⠀ + +""" + + +# "@->- -|>" ... (@->-) +# "..., -|> ?" DONE = """ + ========================================== ----------------D O N E------------------- ========================================== -Thanks for checking out k-mer db. +Thanks for checking out kmerdb. Accept the citation notice with 'kmerdb citation' Use 'kmerdb [cmd] -h' for more details. """ @@ -203,6 +278,61 @@ # MATRIX_MASTHEAD += i*"=" + "\n" +GRAPH_MASTHEAD = """ + +========================================== + kmerdb graph +========================================== +kmerdb graph example.fa example.kdbg --prefix temp_output +----- +. +. +. 
+GRAPH_MASTHEAD = """
+
+==========================================
+                  kmerdb graph
+==========================================
+kmerdb graph example.fa example.kdbg
+
+Generates a weighted edge list (.kdbg) of adjacent k-mer pairs from
+.fa/.fq input, alongside summary statistics of the k-mer space:
+
+per-file  : total k-mers, unique k-mers, nullomers
+aggregate : grand total of k-mers, average k-mers per file,
+            singletons, doublets, triplets
+
+"""
+
+
+
 KMEANS_MASTHEAD = """

 ==========================================
@@ -276,18 +406,8 @@


+I make no disclaimers.

-################################
-# W A R N I N G :  M E M O R Y #
-################################
-# The following is some 'basic' guesses about expected memory usage.
-# I believe this is an oversimplification of the memory profile for technical language and representation issues.
-# The profile loading function reads the array into memory from the file, assuming a certain data encoding.
-# The coding is now set to 'uint64' by default, which in theory allows us to only be limited by numpy array size.
-# Now that we have that out of the way.
-# The integer depth we have is large.
-#
-#
 ##################
 # dimensionality reduction + kmeans
diff --git a/kmerdb/fileutil.py b/kmerdb/fileutil.py
index 960d714..770f423 100644
--- a/kmerdb/fileutil.py
+++ b/kmerdb/fileutil.py
@@ -134,7 +134,7 @@ def _s3_file_download(self, seqpath, temporary=True):

 def parse_line(line):
     """
-    Parses a line according to the expected syntax, and returns the python data types expected as a tuple.
+    Parses a line according to the expected .kdb syntax, and returns the python data types expected as a tuple.

     :param line:
     :type line: str
@@ -226,7 +226,7 @@ def open(filepath, mode="r", metadata=None, sort:bool=False, slurp:bool=False):

 class KDBReader(bgzf.BgzfReader):
     """
-    A class that reads KDB files, potentially just for accessing header metadata, or for reading the entire contents into numpy arrays.
+    A class that reads .kdb files, potentially just for accessing header metadata, or for reading the entire contents into numpy arrays.

:ivar filename: str @@ -284,6 +284,7 @@ def __init__(self, filename:str=None, fileobj:io.IOBase=None, mode:str="r", max_ initial_header_data = OrderedDict(yaml.safe_load(self._buffer)) num_header_blocks = None + if type(initial_header_data) is str: logger.info("Inappropriate type for the header data.") #logger.info("Um, what the fuck is this in my metadata block?") @@ -307,9 +308,7 @@ def __init__(self, filename:str=None, fileobj:io.IOBase=None, mode:str="r", max_ addtl_header_data = yaml.safe_load(self._buffer.rstrip(config.header_delimiter)) if type(addtl_header_data) is str: logger.error(addtl_header_data) - logger.error("Couldn't determine yo' block.::/") - #logger.error("Couldn't at you goldy.") - logger.error("Sorry - Chris Brown.") + logger.error("Couldn't determine this block.::/") raise TypeError("kmerdb.fileutil.KDBReader determined the data in the {0} block of the header data from '{1}' was not YAML formatted".format(i, self._filepath)) elif type(addtl_header_data) is dict: sys.stderr.write("\r") @@ -414,7 +413,7 @@ def _slurp(self, column_dtypes:str="uint64", count_dtypes:str="uint64", frequenc np.dtype(frequencies_dtype) except TypeError as e: logger.error(e) - logger.error("kmerdb.fileutil.KDBReader.slurp encountered a TypeError while assessing the numpy datatype '{0}'...".format(dtype)) + logger.error("kmerdb.fileutil.KDBReader.slurp encountered a TypeError while assessing a numpy dtype") raise TypeError("kmerdb.fileutil.KDBReader._slurp expects the dtype keyword argument to be a valid numpy data type") if type(sort) is not bool: raise TypeError("kmerdb.fileutil.KDBReader._slurp expects the sort keyword argument to be a bool") @@ -424,7 +423,6 @@ def _slurp(self, column_dtypes:str="uint64", count_dtypes:str="uint64", frequenc N = 4**self.k # The dimension of the k-space, or the number of elements for the array num_bytes = 4 * N logger.info("Approximately {0} bytes".format(num_bytes)) - logger.info("Fly.") vmem = psutil.virtual_memory() self.kmer_ids = np.zeros(N, dtype=column_dtypes) self.counts = np.zeros(N, dtype=count_dtypes) @@ -757,7 +755,7 @@ def slurp(self, column_dtypes:str="uint64", count_dtypes:str="uint64", frequenci class KDBWriter(bgzf.BgzfWriter): """ - A wrapper class around Bio.bgzf.BgzfWriter to write a kdb file to disk. + A wrapper class around Bio.bgzf.BgzfWriter to write a .kdb file to disk. :ivar metadata: OrderedDict :ivar filename: str @@ -804,7 +802,7 @@ def __init__(self, metadata:OrderedDict, filename=None, mode="w", fileobj=None, logger.info("Constructing a new .kdb file '{0}'...".format(self._handle.name)) - yaml.add_representer(OrderedDict, util.represent_ordereddict) + yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict) if "b" in mode.lower(): metadata_bytes = bytes(yaml.dump(self.metadata, sort_keys=False), 'utf-8') metadata_plus_delimiter_in_bytes = metadata_bytes + bytes(config.header_delimiter, 'utf-8') diff --git a/kmerdb/graph.py b/kmerdb/graph.py new file mode 100644 index 0000000..92b776a --- /dev/null +++ b/kmerdb/graph.py @@ -0,0 +1,1318 @@ +''' + Copyright 2020 Matthew Ralston + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+'''
+
+
+
+import sys
+import os
+import io
+import gzip
+import random
+
+import psutil
+import tempfile
+
+import yaml, json
+from collections import deque, OrderedDict
+import math
+import re
+from itertools import chain
+
+
+from builtins import open as _open
+
+import jsonschema
+from Bio import SeqIO, Seq, bgzf
+
+
+import numpy as np
+import networkx as nx
+
+from kmerdb import fileutil, parse, kmer, config, util
+
+# Logging configuration
+import logging
+logger = logging.getLogger(__file__)
+
+
+
+
+def open(filepath, mode="r", metadata=None, slurp:bool=False):
+    """
+    Opens a file for reading or writing. Valid modes are 'xrwbt'.
+    Returns a lazy-loading KDBGReader object or a KDBGWriter object.
+    The data may be force loaded with 'slurp=True'.
+
+    This function only stands in front of the KDBGReader/KDBGWriter classes,
+    mirroring the more general kmerdb.fileutil.open. A metadata dictionary
+    (validated against config.graph_schema, which records the node-id and
+    weight dtypes) is required when writing. Graph traversal is not
+    implemented here.
+
+    :param filepath:
+    :type filepath: str
+    :param mode:
+    :type mode: str
+    :param metadata: The file header/metadata dictionary to write to the file.
+    :type metadata: dict
+    :param slurp: Immediately load all data into KDBGReader
+    :type slurp: bool
+    :raise ValueError: mode parameter related errors
+
+    :returns: kmerdb.graph.KDBGReader/kmerdb.graph.KDBGWriter
+    :rtype: kmerdb.graph.KDBGReader
+    """
+    if type(filepath) is not str:
+        raise TypeError("kmerdb.graph.open expects a str as its first positional argument")
+    elif type(mode) is not str:
+        raise TypeError("kmerdb.graph.open expects the keyword argument 'mode' to be a str")
+    elif ("w" in mode or "x" in mode) and (metadata is None or not isinstance(metadata, OrderedDict)):
+        raise TypeError("kmerdb.graph.open expects an additional metadata dictionary")
+    elif type(slurp) is not bool:
+        raise TypeError("kmerdb.graph.open expects a boolean for the keyword argument 'slurp'")
+    modes = set(mode)
+    if modes - set("xrwbt") or len(mode) > len(modes):
+        raise ValueError("invalid mode: {}".format(mode))
+
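+    # Editor's note -- an illustrative, hedged usage sketch mirroring the
+    # dispatch below (not an additional tested API):
+    #
+    #     reader = open("example.12.kdbg", mode="r", slurp=True)   # lazy reader, force-loaded
+    #     writer = open("new.12.kdbg", mode="wb", metadata=header) # header: OrderedDict per config.graph_schema
+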
+    creating = "x" in modes
+    reading = "r" in modes
+    writing = "w" in modes
+    binary = "b" in modes
+    text = "t" in modes
+
+    if text and binary:
+        raise ValueError("can't have text and binary mode at once")
+    elif not (creating or reading or writing):
+        raise ValueError("must have exactly one of create/read/write mode")
+
+    if "r" in mode.lower():
+        return KDBGReader(filename=filepath, mode=mode, slurp=slurp)
+    elif "w" in mode.lower() or "x" in mode.lower():
+        return KDBGWriter(filename=filepath, mode=mode, metadata=metadata)
+    else:
+        raise ValueError("Bad mode %r" % mode)
+
+
+def bytesize_of_metadata(metadata):
+    """
+    Defers to util.get_bytesize_of_metadata
+    """
+    return util.get_bytesize_of_metadata(metadata)
+
+
+def parse_kdbg_table_line(kdbg_table_line:str, sort_arr:bool=False, row_dtype:str="uint64"):
+    """
+    Parses a line according to the expected .kdbg syntax, and returns the python data types expected as a tuple.
+
+    .kdbg table definition
+    =========================
+    vector 'e':
+
+    i        uint64
+    node1_id uint64
+    node2_id uint64
+    w        uint64
+
+    :param kdbg_table_line:
+    :type kdbg_table_line: str
+    :raise TypeError: .kdbg table line was not str
+    :raise TypeError: sort_arr was not bool
+    :raise ValueError: .kdbg table has 4 columns - index, node1_id, node2_id, edge_weight
+    :returns: [i, node1_id, node2_id, weight]
+    :rtype: list
+    """
+
+    if type(kdbg_table_line) is not str:
+        raise TypeError("kmerdb.graph.parse_kdbg_table_line expects a str as its first positional argument")
+    elif type(kdbg_table_line) is str and kdbg_table_line == "":
+        raise StopIteration("empty table line")
+
+    if type(sort_arr) is not bool:
+        raise TypeError("kmerdb.graph.parse_kdbg_table_line needs 'sort_arr' to be bool")
+    try:
+        np.dtype(row_dtype)
+    except TypeError as e:
+        logger.error("Invalid numpy dtype")
+        raise e
+
+    linesplit = kdbg_table_line.rstrip().split("\t")
+
+    if len(linesplit) != 4:
+        logger.error("Full line:\n{0}".format(kdbg_table_line))
+        raise ValueError("kmerdb.graph.parse_kdbg_table_line encountered a .kdbg line without 4 columns. Invalid format for edge list")
+    else:
+        i, node1_id, node2_id, weight = linesplit
+        i, node1_id, node2_id, weight = int(i), int(node1_id), int(node2_id), int(weight)
+
+        e = np.array([node1_id, node2_id, weight], dtype=row_dtype)
+
+        # Still needs the sort_arr default functionality as switchable.
+        # Still needs the solver (ss-sp, or bfs); a bfs would necessitate the objective.
+        # Vanilla cpu implementation for now.
+        # TODO: needs the node delexer. Is this a pre- or post-sort? Should be a post-sort.
+        # Needs the signature, the implementation, and an objective function definition.
+        return [i] + list(e)
+
+
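+# Editor's note -- an illustrative, hedged example of the row format parsed
+# above (not part of the module's tested API). A .kdbg body line is
+# tab-separated:
+#
+#     parse_kdbg_table_line("0\t6\t27\t1\n")  ->  [0, 6, 27, 1]
+#
+# i.e. row index 0, an edge from node (k-mer id) 6 to node 27, observed once.
+
+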
+def parsefile(filepath:str, k:int, quiet:bool=True, b:int=50000, both_strands:bool=False):
+    """
+    Parse a single sequence file.
+
+    :param filepath: Path to a fasta or fastq file
+    :type filepath: str
+    :param k: Choice of k to shred k-mers with
+    :type k: int
+    :param b: Number of reads (per block) to process in parallel
+    :type b: int
+    :param both_strands: Strand specificity argument for k-mer shredding process
+    :type both_strands: bool
+    :raise TypeError: first argument should be a valid filepath
+    :raise TypeError: second argument k should be an int
+    :raise TypeError: quiet keyword arg should be a bool
+    :raise TypeError: *unimplemented* - both_strands should be a bool
+    :raise ValueError: Invalid length-of-0 kmer_id array produced during k-mer shredding
+    :raise AssertionError: relationship 4^k = unique_kmers + (unique_)nullomers found invalid
+    :returns: (edge_list, header, counts, nullomer_array) header_dictionary is the file's metadata for the header block
+    :rtype: (dict, dict, numpy.ndarray, list)
+
+    """
+
+    """
+    Type Checking
+
+    """
+
+    if type(filepath) is not str:
+        raise TypeError("kmerdb.graph.parsefile expects a str as its first positional argument")
+    elif not os.path.exists(filepath):
+        raise OSError("kmerdb.graph.parsefile could not find the file '{0}' on the filesystem".format(filepath))
+    elif type(k) is not int:
+        raise TypeError("kmerdb.graph.parsefile expects an int as its second positional argument")
+    elif type(b) is not int:
+        raise TypeError("kmerdb.graph.parsefile expects the keyword argument 'b' to be an int")
+    elif type(both_strands) is not bool:
+        raise TypeError("kmerdb.graph.parsefile expects the keyword argument 'both_strands' to be a bool")
+
+    N = 4**k
+    counts = np.zeros(N, dtype="uint64")
+
+    logger.debug("Initializing edge list fileparser...")
+    seqprsr = parse.SeqParser(filepath, b, k)
+    fasta = not seqprsr.fastq # Look inside the seqprsr object for the type of file
+
+    # Initialize the kmer class for sequence shredding activities
+    Kmer = kmer.Kmers(k, strand_specific=not both_strands, verbose=fasta, all_metadata=True) # A wrapper class to shred k-mers with
+
+    # Actually load in records 'recs'
+    recs = [r for r in seqprsr]
+
+    logger.info("Read {0} sequences from the {1} seqparser object".format(len(recs), "fasta" if fasta else "fastq"))
+
+    kmer_ids = [] # Accumulates k-mer ids across *all* blocks
+
+    while len(recs):
+        num_recs = len(recs)
+        logger.debug("\n\nAcquiring list of all k-mer ids from {0} sequence records...\n\n".format(num_recs))
+
+        kmer_ids_per_seq = list(map(Kmer._shred_for_graph, recs))
+
+        logger.info("k-mer shredding a block of {0} reads/sequences".format(num_recs))
+        block_kmer_ids = list(chain.from_iterable(kmer_ids_per_seq))
+
+        logger.debug("Successfully flattened all k-mer ids read in this block.")
+
+        num_kmers = len(block_kmer_ids)
+
+        if num_kmers == 0:
+            raise ValueError("No k-mers to add. Something likely went wrong. Please report to the issue tracker")
+        else:
+            sys.stderr.write("\nAccumulating all k-mers from this set of records...")
+            for kmer_id in block_kmer_ids:
+                counts[kmer_id] += 1
+            # Accumulate rather than overwrite, so the edge list below sees
+            # the k-mers of every block, not just the final one.
+            kmer_ids.extend(block_kmer_ids)
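+            # Editor's sketch (hedged, illustrative only): with k=2, a single
+            # read "ACGT" shreds into the 2-mers AC, CG, GT, i.e. base-4 ids
+            # 1, 6, 11, so this pass performs counts[1] += 1; counts[6] += 1;
+            # counts[11] += 1.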
+        # END WHILE routine
+        # load_more_records, according to 'block size'. Legacy syntax.
+        recs = [r for r in seqprsr] # The next block of exactly 'b' reads
+        # This will be logged redundantly with the sys.stderr.write method calls at line 141 and 166 of parse.py (in the _next_fasta() and _next_fastq() methods)
+        #sys.stderr("\n")
+        # JUST LOGGING
+        logger.info("Read {0} more records from the {1} seqparser object".format(len(recs), "fasta" if fasta else "fastq"))
+
+
+
+
+
+    logger.info("Read {0} k-mers from the input file".format(len(kmer_ids)))
+
+    sys.stderr.write("\n\n\n")
+    logger.info("K-mer counts read from input(s)")
+    sys.stderr.write("="*40 + "\n")
+
+    logger.info("K-space dimension: {0}".format(4**k))
+    logger.info("Number of k-mers: {0}".format(len(kmer_ids)))
+    sys.stderr.write("\n\n\n")
+
+
+    logger.info("Constructing weighted edge list...")
+    edge_list = make_graph(kmer_ids, k, quiet=quiet)
+    logger.info("Number of edges: {0}".format(len(edge_list)))
+    sys.stderr.write("\n\nEdge list generation completed...\n")
+
+    unique_kmers = int(np.count_nonzero(counts))
+    num_nullomers = int(np.count_nonzero(counts == 0))
+
+
+    all_kmer_ids = list(range(N))
+    is_nullomer = np.where(counts == 0)
+    kmer_ids_np = np.array(all_kmer_ids, dtype="uint64")
+
+    nullomer_array = list(kmer_ids_np[is_nullomer])
+
+    assert num_nullomers == N - unique_kmers, "kmerdb.graph.parsefile found inconsistencies between two ways of counting nullomers. Internal error."
+
+
+    seqprsr.total_kmers = int(np.sum(counts))
+    seqprsr.unique_kmers = unique_kmers
+    seqprsr.nullomers = num_nullomers
+    seqprsr.nullomer_array = nullomer_array
+
+
+    return (edge_list, seqprsr.header_dict(), counts, seqprsr.nullomer_array)
+
+def make_graph(kmer_ids:list, k:int=None, quiet:bool=True):
+    """
+    Make a neighbor graph from a k-mer id list.
+
+    More specifically, this is an edge list: an edge list of technically
+    adjacent k-mers in a .fa or .fq file.
+
+    Take from that what you will: the input is a 1D list with implicit,
+    i.e. testable, order and no metadata.
+
+    This particular adjacency list will have redundant relationships,
+    i.e. pairs observed again elsewhere in the .fa sequence (1 or multiple)
+    or the .fq file (millions of reads). So they are simply tallied.
+
+
+    :param kmer_ids: list of k-mer ids
+    :type kmer_ids: list
+    :param k: Choice of k to shred k-mers with
+    :type k: int
+    :param quiet: Verbosity parameter
+    :type quiet: bool
+    :raise TypeError: first argument was not a list
+    :raise TypeError: keyword argument k was not an integer
+    :raise TypeError: quiet keyword arg should be a bool
+    :raise AssertionError: A *suppressed* error indicating whether two subsequent ids in the kmer_ids array were found to be unrelated by their sequences. Suppressed to support multi-sequence .fa/.fq files.
+    :returns: An edge list, keyed on a k-mer id 2-tuple e.g. edges[(kmer_id1, kmer_id2)] = weight
+    :rtype: dict
+    """
+
+    """
+    DEBUG
+
+    bad_pair
+
+    A debugging parameter for assessing specific k-mer ids. The debugging
+    process for assessing k-mer pairs and the nature of the non-adjacency is
+    not developed yet.
+
+    3/18/24: more specifically, I am still encountering the bug and haven't
+    developed the algorithm beyond this point. Logging and processing are
+    keyed on this debug variable, set at this point in the namespace and
+    lexical order of the code.
+    """
+    bad_pair = (3050519, 3908357) # A debugging constant used throughout
+
+
+    if kmer_ids is None or type(kmer_ids) is not list:
+        raise TypeError("kmerdb.graph.make_graph expects a list as its only positional argument")
+    elif k is None or type(k) is not int:
+        raise TypeError("kmerdb.graph.make_graph expects the keyword argument k to be an int")
+    elif quiet is None or type(quiet) is not bool:
+        raise TypeError("kmerdb.graph.make_graph expects the keyword argument quiet to be a bool")
+
+    # Step 1: initialize the total list of possible k-mer neighbor pairs
+    # observed in the file.
+    #
+    # BACKLOG/TODO (3/18/24): prepopulating the possible edges of k-space is
+    # still in process, as is an accumulator for lesser edges (edges whose
+    # count/frequency is exceedingly rare: singletons, doublets, nullomers).
+    # A minimal neighbor graph is a dictionary keyed by the
+    # (from_node_id, to_node_id) pair; the self relationship is omitted,
+    # as a k-mer is not a neighbor of itself.
+
+    all_edges_in_kspace = {}
+
+
+    """
+    First we calculate the possible neighbor space for the subspace of k-space
+    observed in the file: each observed k-mer and all 8 of its neighbors,
+    obtained by taking off the leading character and appending characters to
+    the right of the k-1-mer, and vice versa: 8*num_kmers_observed.
+
+    The first loop, populating that subspace, is simple enough that
+    simplification or better-than-constant time would not be necessary.
+    """
+    neighbors = {}
+
+
+    for i in range(len(kmer_ids)):
+
+        k_id = kmer_ids[i]
+
+        current_kmer = kmer.id_to_kmer(k_id, k)
+
+        local_neighbors = kmer.neighbors(current_kmer, k_id, k, quiet=quiet)
+
+        neighbors[k_id] = local_neighbors
+
+        # DEBUG (legacy, 3/18/24): the 12-mer ids 15633431 and 12202077, which
+        # share the subsequence 'GTGGATAACCT', were once missing from the keys
+        # of the edge list dictionary; per-k-mer logging of their 8 non-self
+        # neighbors (char-first/char-last expansions) lived here while that
+        # was investigated, and they remain the canonical reproduction case
+        # for the 'bad_pair' checks below.
+
+    logger.info("adjacency space completed...")
+
+    """
+    At this point, the adjacency structure is a simple list of tuples.
+    Redundant tuples may exist in disparate parts of the graph.
+
+    The structure of the neighbor_graph is [(from_node_id, to_node_id), ...]
+
+    Next, the redundant k-mer id pairs are tallied, so that the edge weights
+    are calculated.
+    """
+
+    error_count = 0
+
+    for i in range(len(kmer_ids)):
+        if i+1 == len(kmer_ids):
+            break
+        else:
+            """
+            The following constitutes a 'neighboring' pair/edge of two k-mers
+            (nodes): the k-mers are sequential in the original k-mer id list
+            (implicitly sorted by virtue of the orientation of the read in the
+            .fasta/.fastq file); this list of k-mers' actual neighbors can be
+            validated by inspecting debug level output below (-vv).
+
+            Note (3/18/24): the input k-mer list is simply parsed and passed in
+            from either a .fa or .fq file. A pair is the current k-mer and its
+            nearest neighbor in the sequence file. Adjacent *reads* in a .fq
+            file also produce adjacent ids in this list, so the computer has no
+            knowledge of whether a k-mer in the list is adjacent in genome
+            coordinates to its literal neighbor in the list, or sits on a
+            subsequent chromosome or Illumina read. Implicit order is
+            understood by the programmer using this, not guaranteed by the
+            data.
+
+            A true pair, by having the k-1-mer in common, guarantees that the
+            current_kmer and its neighbor share at least k-1 residues among
+            all types of neighbor relationships. The k-mer pair (duple of ids)
+            is initialized in the all_edges_in_kspace var, capturing the
+            pair_id space in the keys iterable from .keys().
+            """
+
+            current_kmer_id = kmer_ids[i]
+            current_kmer = kmer.id_to_kmer(current_kmer_id, k)
+            pair = (current_kmer, kmer.id_to_kmer(kmer_ids[i+1], k))
+            pair_ids = tuple(map(kmer.kmer_to_id, pair))
+
+            common_seq = current_kmer[1:]
+            logger.debug("Kmer_id: = kmer_ids[i] = ||| {0}".format(current_kmer_id))
+            logger.debug(" -----pair: = '{0}' <-> '{1}'".format(pair[0], pair[1]))
+            # Initialize this pair's tally only once; recurring pairs must
+            # accumulate below rather than reset to zero.
+            if pair_ids not in all_edges_in_kspace:
+                all_edges_in_kspace[pair_ids] = 0
+
+            """
+            Sanity assertions -- NOT guaranteed across read boundaries
+            (see the except clause below):
+
+            current_kmer
+            pair     : the pair of adjacent k-mers with the k-1-mer as identical sequence
+            pair_ids : ids
+            k1       : k-1-mer current_kmer[1:], left-most char omitted (neighbors *will* get appended to the right side)
+            k2       : k-1-mer current_kmer[:-1], right-most char omitted (neighbors prepended)
+
+            e.g. for ACTGACTG:
+            k1 = CTGACTG (k-1-mer via first char removed; receives appends)
+            k2 = ACTGACT (k-1-mer via last char removed; receives prepends)
+            """
+            assert len(current_kmer) == k
+            assert len(common_seq) == k-1
+
+            logger.debug("Validations completed for k-mer {0}.".format(current_kmer_id))
+            k1 = current_kmer[1:]  # k - 1 characters; left-most char removed, so a new char goes on the end to make the neighbor
+            k2 = current_kmer[:-1] # k - 1 characters; final character removed, so a new char is prepended
+
+            """
+            LOCAL
+
+            B A D  P A I R S
+            debugging a problematic pair of k-mer ids
+            """
+            if pair_ids == bad_pair:
+                logger.error("Problematic pair of k-mer ids identified...")
+                logger.error("Pair: ({0})".format(", ".join(map(str, pair_ids))))
+                raise ValueError("DEBUGGING PURPOSES ONLY: Identified a pair of k-mers noted as problematic. If you see this error, file an issue at https://github.com/MatthewRalston/kmerdb")
+
+            try:
+                """
+                Concise and performance focused: this is the best way in python
+                to assert that the string slice is equivalent from k-mer to
+                k-mer neighbor. Ensuring this throughout is routine;
+                essentially, this code block is added as indication that these
+                assertions hold on a single sequence .fa file, by ensuring that
+                sequential k-mer ids retrieved (via sliding window) from a
+                single sequence .fa are, in fact, related by a common sequence.
+                When the assertion error is triggered, it is most likely due to
+                subsequent reads in a .fastq file, resulting in an erroneous
+                'pair' of k-mers from the algorithm above, targeting the n'th
+                and n+1'th k-mers in the kmer_id array.
+                """
+                assert pair[0][1:] == pair[1][:-1], "kmerdb.graph expects neighboring k-mers to have at least k-1 residues in common. Must be from a fastq file."
+                assert (common_seq in pair[0]) and (common_seq in pair[1]), "kmerdb.graph expects the common_seq be in both sequences of the pair."
+            except AssertionError as e:
+                error_count += 1
+                logger.error(pair)
+                logger.warning("This k-mer pair should have this subsequence in common: {0}".format(common_seq))
+                logger.warning("k1: '{0}' k2: '{1}'".format(k1, k2))
+                logger.warning("It is explicitly assumed this is from .fastq input")
+                logger.debug("'{0}' == '{1}'".format(pair[0][1:], pair[1][:-1]))
+                logger.debug("Is pair0[1:] == pair1[:-1]? : {0}".format(pair[0][1:] == pair[1][:-1]))
+                logger.debug(e)
+                pass
+            try:
+                # Accumulate for each edge pair
+                if bad_pair == pair_ids:
+                    logger.error("\n\nin accumulator... cannot identify the problem for this pairing again. It is in the wrong order in the key submission, and can't be recovered for some reason\n\n")
+                    logger.error("BAD KEY PAIR DETECTED")
+                    logger.error(bad_pair)
+                    sys.exit(1)
+                """
+                ACCUMULATOR --------- DONE
+                """
+                all_edges_in_kspace[pair_ids] += 1
+                if quiet is False:
+                    logger.debug("Edge: {0} => {1}".format(pair[0], pair[1]))
+                    logger.debug("""
+  ╱|、
+(˚ˎ 。7
+ |、˜〵
+じしˍ,)ノ
+""")
+            except KeyError as e:
+                logger.error("Invalid key(pair) used to access edge list")
+                sys.stderr.write("PAIR: {0} => {1}\n".format(pair[0], pair[1]))
+                sys.stderr.write("pair ids: {0}\n".format(pair_ids))
+                logger.error(30*"=")
+                if bad_pair == pair_ids:
+                    logger.error("Debugging the 'bad pair', problematic k-mer id pair...")
+                    logger.error(pair_ids)
+                raise e
+
+            maxlen = 15 # Placeholder
+            p1str = str(pair_ids[0])
+            p2str = str(pair_ids[1])
+            p1str = p1str + (maxlen - len(p1str))*" "
+            p2str = p2str + (maxlen - len(p2str))*" "
+
+            sys.stderr.write("k-mer 'pair' ({0}, {1}) adjacency from input file(s) verified".format(p1str, p2str))
+            sys.stderr.write("\r")
+            continue
+
+
+    if error_count > 0:
+        logger.warning("\n")
+        logger.warning("\n\n\nNOTE: ADJACENCY ERRORS DETECTED: Found {0} 'improper' k-mer pairs/adjacencies from the input file(s),\n where subsequent k-mers in the k-mer id array (produced from the sliding window method over input seqs/reads) did not share k-1 residues in common.\n These *may* be introduced in the array from the last k-mer of one seq/read (in the .fa/.fq) and the first k-mer of the next seq/read.\n".format(error_count))
+    sys.stderr.write("\n\n\n")
+
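+    # Editor's note (hedged, illustrative): why read boundaries trip the
+    # assertions above. With k=3 and two reads "AAAA" then "TTTT", the id
+    # array is [0, 0, 63, 63]; the middle pair (AAA, TTT) shares no k-1-mer
+    # ("AA" != "TT"), so it is counted in error_count rather than aborting.
+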
sys.stderr.write("All edges populated *and* accumulated across k-mer id arrays from inputs.\n") + logger.info("'Graph' generation complete") + + + # This variable is unrestricted. A larger k may inflate this var beyond the memory limits of the computer where the observed k-space can be represented with enough resolution through enough profile diversity. + """ + node_pairs and the k-space diversity + + + Depending on inputs, this may become large. + """ + return all_edges_in_kspace + + + +def create_graph(nodes:list, edge_tuples:list, gpu:bool=False): + + + + if nodes is None or type(nodes) is not list or not all(type(n) is not int for n in nodes): + raise TypeError("kmerdb.graph.create_graph expects the first argument to be a list of ints") + elif edge_tuples is None or type(edge_tuples) is not list or not all(len(e) != 3 for e in edge_tuples) or not all((type(e[0]) is int and type(e[1]) is int) for e in edge_tuples): + raise TypeError("kmerdb.graph.create_graph expects the second argument to be a list of tuples of length 2") + elif gpu is None or type(gpu) is not bool: + raise TypeError("kmerdb.graph.create_graph expects the keyword argument gpu to be a bool") + + + """ + Now we make the networkx graph + """ + G = nx.Graph() + + G.add_nodes_from(nodes) + + + G.add_edges_from(edge_tuples) + + + + + + +def w_lexer(): + pass + + +def v_order_lexer(e:tuple, asc:bool=False): + """ + Uh. + """ + if type(asc) is not bool: + raise TypeError("kmerdb.graph requires valid bool keyword arg 'asc'") + + if type(e) is not tuple: + logger.error("kmerd.graph.threetuple_v_order_lexer needs a valid tuple") + + sys.exit(1) + exit(1) + elif len(e) != 2: + raise IndexError("kmerdb.graph.v_order_lexer requires length 2") + elif asc is not bool: + logger.error("kmerdb.graph.v_order_lexer accepts the sort order as an input") + raise TypeError("kmerdb.graph requires a lexical order") + elif asc is False: + raise RuntimeError("requires the order") + + + if len(e) != 3: + logger.error("kmerdb.graph invalid input") + raise TypeError("kmerdb.graph requires a 3-tuple") + + n1, n2, w = e + + # Maybe... + n1 = int(n1) + n2 = int(n2) + + # This ... smh + if asc is True: + if n1 > n2: + return (n2, n1, w) + elif n2 > n1: + return (n1, n2, w) + else: + raise RuntimeError('x') + elif asc is False: + if n1 > n2: + return (n1, n2, w) + elif n2 > n1: + return (n2, n1, w) + else: + raise RuntimeError('x') + else: + + logger.error("idgaf") + return (n1, n2, w) + + +def randomsort_lexer(g): + return (random.randint(0, 1428945), random.randint(0, 1428945), random.randint(0, 1428945)) + +def wrupsort_lexer(g): + return sorted([g[0], g[1]]) + + + +def wrongsort_lexer(g, asc:bool=False): + if asc is True: + if n1 > n2: + return (n1, n2, w) + elif n2 > n1: + return (n2, n1, w) + else: + raise RuntimeError('x') + elif asc is False: + if n1 > n2: + return (n2, n1, w) + elif n2 > n1: + return (n1, n2, w) + else: + raise RuntimeError('x') + + + + +class KDBGReader(bgzf.BgzfReader): + """ + A class to read .kdbg files. + + + + + 1 args + + 8 keyword args + + fileobj + mode + n1_dtype + n2_dtype + weights_dtype + sort + slurp + + :ivar filename: The choice of k to shred with + :ivar fileobj: An existing fileobject from io.IOBase + :ivar mode: read/write mode + :ivar n1_dtype: NumPy dtype + :ivar n2_dtype: NumPy dtype + :ivar weights_dtype: NumPy dtype + :ivar sort: *unimplemented* - uh.. 
sort the .kdbg edges somehow + :ivar slurp: Autoload the .kdbg file + + + """ + def __init__(self, filename:str, fileobj:io.IOBase=None, mode:str="r", n1_dtype:str="uint64", n2_dtype:str="uint64", weights_dtype:str="uint64", sort:bool=False, slurp:bool=False): + """ + A wrapper around Bio.bgzf.BgzfReader + + :param filename: A valid filepath + :type filename: str + :param fileobj: An existing fileobject from io.IOBase + :type fileobj: io.IOBase + :param mode: read/write mode + :type mode: str + :param n1_dtype: NumPy dtype + :type n1_dtype: str + :param n2_dtype: NumPy dtype + :type n2_dtype: str + :param weights_dtype: NumPy dtype + :type weights_dtype: str + :param sort: *unimplemented* - uh.. sort the .kdbg edges somehow + :type sort: bool + :param slurp: Autoload the .kdbg file + :type slurp: bool + :raise TypeError: fileobj was not a file-type interface, inheriting from io.IOBase + :raise TypeError: filename was not a str + : + + """ + """ + + fileobj type checking + """ + + + if fileobj is not None and not isinstance(fileobj, io.IOBase): + raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'fileobj' to be a file object") + if filename is not None and type(filename) is not str: + raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'filename' to be a str") + elif mode is not None and type(mode) is not str: + raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'mode' to be a str") + elif n1_dtype is not None and type(n1_dtype) is not str: + raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'n1_dtype' to be a str") + elif n2_dtype is not None and type(n2_dtype) is not str: + raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'n2_dtype' to be a str") + elif weights_dtype is not None and type(weights_dtype) is not str: + raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'weights_dtype' to be a str") + + elif sort is not None and type(sort) is not bool: + raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'sort' to be a bool") + elif slurp is not None and type(slurp) is not bool: + raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'slurp' to be a bool") + + """ + + Handle fileobj or + """ + + if fileobj: + assert filename is None + handle = fileobj + assert "b" in handle.mode.lower() + else: + if "w" in mode.lower() or "a" in mode.lower(): + raise ValueError("Must use read mode (default), not write or append mode") + handle = _open(filename, "rb") + + self._text = "b" not in mode.lower() + if self._text: + self._newline = "\n" + else: + self._newline = b"\n" + self._handle = handle + self._filepath = self._handle.name + + # Placeholder + self.max_cache = 100 + + self._buffers = {} + self._block_start_offset = None + self._block_raw_length = None + + + """ + + np.array defs + + n1 + n2 + weights + + """ + + self.n1 = None + self.n1_dtype = None + self.n2 = None + self.n2_dtype = None + self.weights = None + self.weights_dtype = None + + self.read_and_validate_kdbg_header() + + if slurp is True: + logger.info("Reading .kdbg contents into memory") + self.slurp(n1_dtype=n1_dtype, n2_dtype=n2_dtype, weights_dtype=weights_dtype) + + self.is_int = True + handle.close() + self._handle.close() + self._handle = None + handle = None + fileobj=None + return + + def read_and_validate_kdbg_header(self): + + """ + + KDBGReader PyYAML metadata ingestion + + """ + + ''' + Here we want to load the metadata blocks. 
We want to load the first two lines of the file: the first line is the version, followed by the number of metadata blocks
+        '''
+        # 0th block
+        logger.info("Loading the 0th block from '{0}'...".format(self._filepath))
+        self._load_block(self._handle.tell())
+
+        self._buffer = self._buffer.rstrip(config.header_delimiter)
+
+        initial_header_data = OrderedDict(yaml.safe_load(self._buffer))
+
+        # Placeholder
+        num_header_blocks = None
+
+        if type(initial_header_data) is str:
+            logger.info("Inappropriate type for the header data.")
+            raise TypeError("kmerdb.graph.KDBGReader could not parse the YAML formatted metadata in the first blocks of the file")
+        elif type(initial_header_data) is OrderedDict:
+            logger.info("Successfully parsed the 0th block of the file, which is expected to be the first block of YAML formatted metadata")
+            logger.info("Assuming YAML blocks until delimiter reached.")
+
+        """
+
+        KDBGReader
+
+        YAML metadata spec validation
+        """
+
+        if "version" not in initial_header_data.keys():
+            raise TypeError("kmerdb.graph.KDBGReader couldn't validate the header YAML")
+        elif "metadata_blocks" not in initial_header_data.keys():
+            raise TypeError("kmerdb.graph.KDBGReader couldn't validate the header YAML")
+
+        logger.debug(initial_header_data)
+
+        if initial_header_data["metadata_blocks"] != 1:
+            logger.error("More than one metadata block detected; multi-block .kdbg headers are not supported yet")
+            raise IOError("Cannot read more than 1 metadata block yet")
+
+        for i in range(initial_header_data["metadata_blocks"] - 1):
+            logger.info("Multiple metadata blocks read...")
+            self._load_block(self._handle.tell())
+            logger.debug("Secondary blocks read")
+            addtl_header_data = yaml.safe_load(self._buffer.rstrip(config.header_delimiter))
+            if type(addtl_header_data) is str:
+                logger.error(addtl_header_data)
+                logger.error("Couldn't determine this block.::/")
+                raise TypeError("kmerdb.graph.KDBGReader determined the data in the {0} block of the header data from '{1}' was not YAML formatted".format(i, self._filepath))
+            elif type(addtl_header_data) is dict:
+                sys.stderr.write("\r")
+                sys.stderr.write("Successfully parsed {0} blocks of YAML formatted metadata".format(i))
+                initial_header_data.update(addtl_header_data)
+                num_header_blocks = i
+            else:
+                logger.error(addtl_header_data)
+                raise RuntimeError("kmerdb.graph.KDBGReader encountered an addtl_header_data type that wasn't expected when parsing the {0} block from the .kdb file '{1}'.".format(i, self._filepath))
+
+
+        logger.info("Reading additional blocks as YAML...")
+        sys.stderr.write("\n")
+        logger.info("Validating the header data against the schema...")
+        try:
+            jsonschema.validate(instance=initial_header_data, schema=config.graph_schema)
+            self.metadata = dict(initial_header_data)
+
+            self.k = self.metadata['k']
+            self.n1_dtype = self.metadata["n1_dtype"]
+            self.n2_dtype = self.metadata["n2_dtype"]
+            self.weights_dtype = self.metadata["weights_dtype"]
+            self.sorted = self.metadata["sorted"]
+            logger.debug("dtypes and sort order assigned from the validated header")
+        except jsonschema.ValidationError as e:
+            logger.debug(e)
+            logger.error("kmerdb.graph.KDBGReader couldn't validate the header/metadata YAML from {0} header blocks".format(num_header_blocks))
+            raise e
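+
+        # Editor's note (hedged): Bio.bgzf.BgzfBlocks below yields a 4-tuple of
+        # (raw start, raw length, data start, data length) per BGZF block; the
+        # deque of offsets is what later lets the reader seek past the header
+        # blocks to the first data block.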
self.metadata["header_offset"] = self._handle.tell() + logger.debug("Handle set to {0} after reading header, saving as handle offset".format(self.metadata["header_offset"])) + #self._reader = gzip.open(self._filepath, 'r') + self._offsets = deque() + for values in bgzf.BgzfBlocks(self._handle): + #logger.debug("Raw start %i, raw length %i, data start %i, data length %i" % values) + self._offsets.appendleft(values) # raw start, raw length, data start, data length + if len(self._offsets) == 0: + raise IOError("kmerdb.graph.KDBGReader opened an empty file") + # Skip the zeroth block + self._load_block() + # print(str(self._buffer)) # 1 + # print(self.readline()) + # self._load_block() + # print(self._buffer) # 2 + # print(self.readline()) + + + def read_line(self): + """ + Read and parse a single line from the .kdbg file + + :returns: node1_id, node2_id, weight + :rtype: tuple + + """ + + line = self.readline() + + + if self.n1_dtype == "uint64" and self.n2_dtype == "uint64" and self.weights_dtype == "uint64": + return parse_kdbg_table_line(line, row_dtype="uint64") + else: + raise IOError("kmerdb.graph.KDBGReader.read_line Cannot determine file dtype") + + + + def slurp(self, n1_dtype:str="uint64", n2_dtype:str="uint64", weights_dtype:str="uint64"): + """ + Autoload the .kdbg file into memory + + :param n1_dtype: NumPy dtype + :type str: + :param n2_dtype: NumPy dtype + :type str: + :param weights_dtype: NumPy dtype + :type str: + + """ + + if type(n1_dtype) is not str: + raise TypeError("kmerdb.graph.KDBGReader.slurp expects n1_dtype to be a str") + if type(n2_dtype) is not str: + raise TypeError("kmerdb.graph.KDBGReader.slurp expects n2_dtype to be a str") + if type(weights_dtype) is not str: + raise TypeError("kmerdb.graph.KDBGReader.slurp expects weights_dtype to be a str") + + try: + np.dtype(n1_dtype) + np.dtype(n2_dtype) + np.dtype(weights_dtype) + except TypeError as e: + logger.error(e) + logger.error("kmerdb.graph.KDBGReader.slurp encountered a TypeError while assessing a numpy dtype") + raise TypeError("kmerdb.graph.KDBGReader.slurp expects each dtype keyword argument to be a valid numpy data type") + + vmem = psutil.virtual_memory() + + i = 0 + + idx = [] + arrs = [] + node1 = [] + node2 = [] + weights = [] + reading=True + while reading is True: + + try: + line = next(self) + + except StopIteration as e: + logger.error("Finished loading .kdbg through slurp (on init)") + raise e + if line is None: + logger.warning("Next returned None. Panic") + sys.exit(1) + raise RuntimeError("kmerdb.graph.KDBGReader.slurp Panicked on new line") + + + + try: + + arr = self.read_line() + arrs.append(arr) + + + i, n1, n2, weight = arr + + idx.append(i) + node1.append(n1) + node2.append(n2) + weights.append(weight) + except StopIteration as e: + reading = False + + + self.n1 = np.array(node1, dtype=n1_dtype) + self.n2 = np.array(node2, dtype=n2_dtype) + self.weights = np.array(weights, dtype=weights_dtype) + return + + + + +class KDBGWriter(bgzf.BgzfWriter): + """ + A wrapper class around Bio.bgzf.BgzfWriter to write a .kdbg file to disk. 
+ + + :ivar filename: valid filepath + :ivar mode: read/write mode + :ivar metadata: header information + :ivar both_strands: *unimplemented* + :ivar fileobj: io.IOBase + :ivar compresslevel: compression parameter + """ + + def __init__(self, filename:str=None, mode:str="w", metadata:OrderedDict=None, both_strands:bool=False, fileobj:io.IOBase=None, compresslevel:int=6): + """ + A wrapper around Bio.bgzf.BgzfWriter + + :param filename: A valid filepath + :type filename: str + :param mode: read/write mode + :type mode: str + :param metadata: A metadata header for the .kdbg file + :type metadata: collections.OrderedDict + :param both_strands: *unimplemented* + :type both_strands: bool + :param compresslevel: compression parameter + :type compresslevel: int + :raise TypeError: filename was not a str + :raise TypeError: mode was invalid + :raise TypeError: both_strands was invalid + :raise TypeError: fileobj was invalid + :raise TypeError: compresslevel was invalid + :raise ValueError: mode was invalid + :raise NotImplementedError: append mode was invalid + """ + + if filename is None or type(filename) is not str: + raise TypeError("kmerdb.graph.KDBGWriter expects the filename to be a str") + elif mode is None or type(mode) is not str: + raise TypeError("kmerdb.graph.KDBGWriter expects the mode to be a str") + elif metadata is None or type(metadata) is not OrderedDict: + raise TypeError("kmerdb.graph.KDBGWriter - invalid metadata argument") + elif both_strands is None or type(both_strands) is not bool: + raise TypeError("kmerdb.graph.KDBGWriter expects the keyword argument 'both_strands' to be a bool") + elif fileobj is not None and not isinstance(fileobj, io.IOBase): + raise TypeError("kmerdb.graph.KDBGWriter expects the keyword argument 'fileobj' to be an instance of io.IOBase") + elif compresslevel is None or type(compresslevel) is not int: + raise TypeError("kmerdb.graph.KDBGWriter expects the keyword argument 'compresslevel' to be an int") + + if fileobj: + assert filename is None, "kmerdb.graph expects filename to be None is fileobj handle is provided" + handle = fileobj + else: + if "w" not in mode.lower() and "a" not in mode.lower(): + raise ValueError("Must use write or append mode, not %r" % mode) + elif "wb" == mode: + pass + elif "a" in mode.lower(): + raise NotImplementedError("Append mode is not implemented yet") + # handle = _open(filename, "ab") + else: + logger.error("Mode = {0}".format(mode)) + raise RuntimeError("Unknown mode for .kdbg file writing class kmerdb.graph.KDBGWriter") + self._text = "b" not in mode.lower() + self._handle = _open(filename, "wb") + self._buffer = b"" if "b" in mode.lower() else "" + self.compresslevel = compresslevel + + """ + Write the header to the file + """ + + + logger.info("Constructing a new .kdbg file '{0}'...".format(self._handle.name)) + + + # 3-04-2024 + yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict) + + self.metadata = metadata + + #self._write_block(metadata_slice) + if "b" in mode.lower(): + metadata_bytes = bytes(yaml.dump(self.metadata, sort_keys=False), 'utf-8') + metadata_plus_delimiter_in_bytes = metadata_bytes + bytes(config.header_delimiter, 'utf-8') + self.metadata["metadata_blocks"] = math.ceil( sys.getsizeof(metadata_plus_delimiter_in_bytes) / ( 2**16 ) ) # First estimate + metadata_bytes = bytes(yaml.dump(self.metadata, sort_keys=False), 'utf-8') + metadata_bytes = metadata_bytes + bytes(config.header_delimiter, 'utf-8') + self.metadata["metadata_blocks"] = math.ceil( 
sys.getsizeof(metadata_bytes) / ( 2**16 ) ) # Second estimate + metadata_bytes = bytes(yaml.dump(self.metadata, sort_keys=False), 'utf-8') + metadata_bytes = metadata_bytes + bytes(config.header_delimiter, 'utf-8') + logger.info("Writing the {0} metadata blocks to the new file".format(self.metadata["metadata_blocks"])) + logger.debug(self.metadata) + logger.debug("Header is being written as follows:\n{0}".format(yaml.dump(self.metadata, sort_keys=False))) + + # 01-01-2022 This is still not a completely functional method to write data to bgzf through the Bio.bgzf.BgzfWriter class included in BioPython + # I've needed to implement a basic block_writer, maintaining compatibility with the Biopython bgzf submodule. + #self.write(bytes(yaml.dump(metadata, sort_keys=False), 'utf-8')) + + for i in range(self.metadata["metadata_blocks"]): + metadata_slice = metadata_bytes[:65536] + metadata_bytes = metadata_bytes[65536:] + self._write_block(metadata_slice) + + #self._write_block + self._buffer = b"" + self._handle.flush() + elif "w" == mode.lower() or "x" == mode.lower(): + self.write(yaml.dump(metadata, sort_keys=False)) + self._buffer = "" + self._handle.flush() + else: + logger.error("Mode: {}".format(mode.lower())) + raise RuntimeError("Could not determine proper encoding for write operations to .kdb file") + + + + + + +class Parseable: + def __init__(self, arguments): + self.arguments = arguments + + + def parsefile(self, filename): + """Wrapper function for graph.parsefile to keep arguments succinct for deployment through multiprocessing.Pool + + :param filename: the filepath of the fasta(.gz)/fastq(.gz) to process with kmerdb.graph.parsefile + :type filename: str + """ + return parsefile(filename, self.arguments.k, quiet=self.arguments.quiet, b=self.arguments.fastq_block_size, both_strands=self.arguments.both_strands) + + + diff --git a/kmerdb/kmer.py b/kmerdb/kmer.py index 8e64b4d..e57492d 100644 --- a/kmerdb/kmer.py +++ b/kmerdb/kmer.py @@ -30,6 +30,11 @@ # ############################# +""" +a list of the unicode character encodings for the DNA alphabet +note 65 is A, 67 is C, 71 is G, 84 is T. 
+""" + letterToBinary={ # Unicode UTF-8 byte codes for ACGT 65: 0, 67: 1, @@ -40,6 +45,9 @@ standard_letters=set("ACTG") +""" +IUPAC support mappings for the k-mer counter (NOT USED IN THE ASSEMBLER) +""" IUPAC_TO_DOUBLET = { "R": ["A", "G"], "Y": ["C", "T"], @@ -56,6 +64,7 @@ "V": ["A", "C", "G"] } IUPAC_TRIPLET_CHARS=IUPAC_TO_TRIPLET.keys() +# *extremely* necessary variable n = "N" ############################# # @@ -78,20 +87,20 @@ class Kmers: """A wrapper class to pass variables through the multiprocessing pool :ivar k: The choice of k to shred with - :ivar strand_specific: Include k-mers from forward strand only + :ivar strand_specific: Include k-mers from forward strand only (TO BE DEPRECATED) :ivar verbose: print extra logging in the case of fasta files :ivar all_metadata: return extra metadata about sequence locations """ - def __init__(self, k, strand_specific=True, verbose=False, all_metadata=False): + def __init__(self, k, strand_specific:bool=True, verbose:bool=False, all_metadata:bool=False): """ :param k: The choice of k to shred with :type k: int - :param strand_specific: Include k-mers from forward strand only + :param strand_specific: Include k-mers from forward strand only (TO BE DEPRECATED) :type strand_specific: bool :param verbose: Whether or not to print extra logging in the case of fasta and not in the case of fastq :type verbose: bool - :param all_metadata: Whether or not to pass back extra metadata + :param all_metadata: Whether or not to pass back extra metadata (TO BE DEPRECATED) :type all_metadata: bool @@ -110,37 +119,105 @@ def __init__(self, k, strand_specific=True, verbose=False, all_metadata=False): self.all_metadata = all_metadata self.__permitted_chars = set("ACTGRYSWKMBDHVN") - def shred(self, seqRecord): + def validate_seqRecord_and_detect_IUPAC(self, seqRecord:Bio.SeqRecord.SeqRecord, is_fasta:bool=True, quiet_iupac_warning:bool=True): """ - Take a seqRecord fasta/fastq object and slice according to the IUPAC charset. - Doublets become replace with two counts, etc. + Helper method for validating seqRecord and warnings for non-standard IUPAC residues. + + :param seqRecord: a BioPython SeqRecord object + :type seqRecord: Bio.SeqRecord.SeqRecord + :param is_fasta: are the inputs ALL .fasta? + :type is_fasta: bool + :param quiet_iupac_warning: verbosity parameter + :type quiet_iupac_warning: bool + :raise ValueError: if *non* IUPAC symbols are detected in the seqRecord object. + :returns: a set of the letters detected, and length of the validated Bio.SeqRecord sequence + :rtype: tuple - :param seqRecord: - :type seqRecord: Bio.SeqRecord.SeqRecord - :returns: a parallel map - :rtype: dict """ + if type(quiet_iupac_warning) is not bool: + raise TypeError("kmerdb.kmer.validate_seqRecord_and_detect_IUPAC expects keyword argument 'quiet_iupac_warning' to be a bool") + elif not isinstance(seqRecord, Bio.SeqRecord.SeqRecord): + raise TypeError("kmerdb.kmer.validate_seqRecord_and_detect_IUPAC expects a Bio.SeqRecord.SeqRecord object as its first positional argument") + letters = set(seqRecord.seq) # This is ugly. Should really be explicitly cast to str + seqlen = len(seqRecord.seq) # `` Ditto. 
-    def shred(self, seqRecord):
+    def validate_seqRecord_and_detect_IUPAC(self, seqRecord:Bio.SeqRecord.SeqRecord, is_fasta:bool=True, quiet_iupac_warning:bool=True):
         """
-        Take a seqRecord fasta/fastq object and slice according to the IUPAC charset.
-        Doublets become replace with two counts, etc.
+        Helper method for validating seqRecord and warnings for non-standard IUPAC residues.
+
+        :param seqRecord: a BioPython SeqRecord object
+        :type seqRecord: Bio.SeqRecord.SeqRecord
+        :param is_fasta: are the inputs ALL .fasta?
+        :type is_fasta: bool
+        :param quiet_iupac_warning: verbosity parameter
+        :type quiet_iupac_warning: bool
+        :raise ValueError: if *non* IUPAC symbols are detected in the seqRecord object.
+        :returns: a set of the letters detected, and length of the validated Bio.SeqRecord sequence
+        :rtype: tuple

-        :param seqRecord:
-        :type seqRecord: Bio.SeqRecord.SeqRecord
-        :returns: a parallel map
-        :rtype: dict
         """
+        if type(quiet_iupac_warning) is not bool:
+            raise TypeError("kmerdb.kmer.validate_seqRecord_and_detect_IUPAC expects keyword argument 'quiet_iupac_warning' to be a bool")
+        elif not isinstance(seqRecord, Bio.SeqRecord.SeqRecord):
+            raise TypeError("kmerdb.kmer.validate_seqRecord_and_detect_IUPAC expects a Bio.SeqRecord.SeqRecord object as its first positional argument")
+        letters = set(str(seqRecord.seq)) # Explicitly cast to str before taking the set of letters
+        seqlen = len(seqRecord.seq)
+
-
-        if not isinstance(seqRecord, Bio.SeqRecord.SeqRecord):
-            raise TypeError("kmerdb.kmer.Kmers expects a Bio.SeqRecord.SeqRecord object as its first positional argument")
-        letters = set(seqRecord.seq)
-        seqlen = len(seqRecord.seq)
         if seqlen < self.k:
             logger.error("Offending sequence ID: {0}".format(seqRecord.id))
-            raise ValueError("kmerdb expects that each input sequence is longer than k.")
+            raise ValueError("kmerdb.kmer.validate_seqRecord_and_detect_IUPAC expects that each input sequence is longer than k.")
+        all_iupac_symbols = list(letters.intersection(self.__permitted_chars) - standard_letters)
         all_non_iupac_symbols = list(letters - self.__permitted_chars)
+
         if len(all_non_iupac_symbols) > 0:
-            logger.warning("One or more unexpected characters in the {0} sequence".format("fasta" if self.verbose else "fastq"))
+            logger.warning("One or more unexpected characters in the {0} sequence".format("fasta" if is_fasta else "fastq"))
             logger.warning(list(letters - self.__permitted_chars))
             raise ValueError("Non-IUPAC symbols detected in the fasta file")
         elif len(all_iupac_symbols) > 0:
-            logger.warning("Will completely refuse to include k-mers with 'N'")
-            logger.warning("All counts for k-mers including N will be discarded")
-            logger.warning("Other IUPAC symbols will be replaced with their respective pairs/triads")
-            logger.warning("And a count will be given to each, rather than a half count")
+            if quiet_iupac_warning is False:
+                logger.warning("Will completely refuse to include k-mers with 'N'")
+                logger.warning("All counts for k-mers including N will be discarded")
+                logger.warning("Other IUPAC symbols will be replaced with their respective pairs/triads")
+                logger.warning("And a count will be given to each, rather than a half count")
+            elif quiet_iupac_warning is True:
+                logger.warning("Suppressing warning that non-standard IUPAC residues (including N) are detected.")
+        return (letters, seqlen)
+
+
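+    # Editor's sketch (hedged, illustrative only): for a record whose sequence
+    # is "ACGTNR", the method above returns letters == {'A','C','G','T','N','R'}
+    # and seqlen == 6; 'N' and 'R' are IUPAC but non-standard, so with
+    # quiet_iupac_warning=True only the suppression notice is logged.
+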
+ + :param seqRecord: a BioPython sequence object to shred into k-mers + :type seqRecord: Bio.SeqRecord.SeqRecord + :raise RuntimeError: Non-IUPAC and non-standard IUPAC (other than ATGC+N) residues detected + :return: k-mer ids + :rtype: list + """ + kmers = [] + + letters, seqlen = self.validate_seqRecord_and_detect_IUPAC(seqRecord, quiet_iupac_warning=True) + + + # Iterate over *each* k-mer in the sequence by index + for i in range(seqlen - self.k + 1): + s = seqRecord.seq[i:(i+self.k)] # Creates the k-mer as a slice of a seqRecord.seq + # No non-standard IUPAC residues allowed for _shred for use in graph.py + nonstandard_iupac_symbols = list(set(s) - standard_letters) + non_iupac_symbols = list(set(s) - self.__permitted_chars) + + if len(non_iupac_symbols) > 1: + logger.error("Non-IUPAC symbols:") + logger.error(non_iupac_symbols) + raise RuntimeError("Non-IUPAC symbol(s) detected...") + elif len(nonstandard_iupac_symbols) == 0: + #logger.debug("Perfect sequence content (ATCG) detected...") + kmers.append(kmer_to_id(s)) + elif len(nonstandard_iupac_symbols) == 1 and iupac_symbols[0] == n: + #logger.debug("Only N-abnormal sequence content detected (aside from ATCG)...") + kmers.append(kmer_to_id(s)) + elif len(nonstandard_iupac_symbols) > 1: + logger.error("Assembly cannot continue with this k-mer, and it will be discarded, possibly affecting assembly process") + raise RuntimeError("Non-standard IUPAC symbols detected in .fa/.fq file...") + return kmers + + def shred(self, seqRecord): + """ + Take a seqRecord fasta/fastq object and slice according to the IUPAC charset. + Doublets become replace with two counts, etc. + + :param seqRecord: + :type seqRecord: Bio.SeqRecord.SeqRecord + :raise RuntimeError: No IUPAC characters detected or Non-IUPAC characters detected + :raise ValueError: Non-IUPAC characters detected + :returns: a dictionary of information about the Bio.SeqRecord shredded and the k-mers produced (including the IUPAC doublet/triplet expansion + :rtype: dict + + """ + letters, seqlen = self.validate_seqRecord_and_detect_IUPAC(seqRecord, quiet_iupac_warning=True) + + kmers = [] starts = [] reverses = [] @@ -184,20 +261,19 @@ def shred(self, seqRecord): logger.error("Full sequence above") logger.error("K-mer: {0}".format(s)) logger.error(s) - raise RuntimeError("Non-IUPAC character '{0}' made it into the IUPAC extension of kmerdb.kmer.Kmer.shred".format(c)) + raise RuntimeError("kmerdb.kmer.shred: Non-IUPAC character '{0}' made it into sequences generated from IUPAC doublet/triplet counting of the k-mer '{1}'".format(c, s)) if seqs is None and "N" not in s: - logger.debug(iupac_symbols) - logger.debug(non_iupac_symbols) - logger.error(s) - logger.error("The k-mer above did not have any IUPAC letters") - raise RuntimeError("No sequences generated in IUPAC module of kmerdb.kmer.Kmer.shred") + logger.warning("Permitted IUPAC symbols: {0}".format(iupac_symbols)) + logger.warning("Non-IUPAC symbols detected in sequence '{0}': {1}".format(s, non_iupac_symbols)) + logger.error("The following k-mer did not have any IUPAC letters: {0}".format(s)) + raise RuntimeError("kmerdb.kmer.shred: A sequence was rejected from non-IUPAC symbols") elif seqs is None and "N" in s: continue one_word = "".join(seqs) if len(set(one_word) - self.__permitted_chars) > 0: logger.error(one_word) - logger.error(seqs) - raise ValueError("Still IUPAC characters in the strings") + logger.error("Doublets/triplets produced from k-mer '{0}': \n{1}".format(s, seqs)) + raise ValueError("kmerdb.kmer.shred: at least one 
non-IUPAC symbol was found during doublets/triplet expansion of the k-mer '{0}'".format(s))
                     else:
                         for x in seqs:
                             logger.debug(x)
@@ -217,12 +293,12 @@ def shred(self, seqRecord):
                     elif len(non_iupac_symbols) > 0:
                         logger.debug("TEST CONDITIONS")
                         logger.debug("Non-permitted characters should be 0: {0}".format(len(set(str(s)) - self.__permitted_chars)))
-                        logger.debug("IUPAC symbols should be >= 0: {0}".format(len(iupac_symbols)))
+                        logger.debug("IUPAC symbols should be > 0: {0}".format(len(iupac_symbols)))
                         logger.debug("Non-IUPAC characters should be 0: {0}".format(len(non_iupac_symbols)))
                         logger.debug("Non-IUPAC symbols: {0}".format(non_iupac_symbols))
                         logger.debug("IUPAC symbols: {0}".format(iupac_symbols))
-                        logger.debug("K-mer: {0}".format(str(s)))
-                        raise ValueError("Non-IUPAC symbol detected")
+                        logger.debug("K-mer: '{0}'".format(str(s)))
+                        raise ValueError("kmerdb.kmer.shred: Non-IUPAC symbol(s) detected")
                     else:
                         try:
                             kmers.append(kmer_to_id(s))
@@ -235,13 +311,12 @@ def shred(self, seqRecord):
                                 reverses.append(True)
                                 starts.append(i)
                         except KeyError as e:
-                            logger.debug("One or more non-IUPAC letters found their way into a part of the code they're not supposed to go")
-                            logger.debug("We officially support IUPAC but the statement challenging the sequence content failed, causing a genuine runtime error")
-                            logger.debug("This caused the following KeyError")
-                            logger.debug(e)
+                            logger.warning("One or more non-IUPAC letters found their way into a part of the code where they're not supposed to go")
+                            logger.warning("We officially support IUPAC, but the check on the sequence content failed, causing a genuine runtime error")
+                            logger.warning("This caused the following KeyError")
+                            logger.warning(e)
                             logger.error("Letters in the sequence: {0}".format(letters))
                             logger.error("Permitted letters: {0}".format(self.__permitted_chars))
-                            logger.error("Unexpected behavior, rerun with -vv (DEBUG verbosity) to see more information")
                             raise RuntimeError("IUPAC standard extra base pairs (R, B, etc.) or non-IUPAC characters detected in the sequence")
                 del s
         if self.verbose:
@@ -272,7 +347,9 @@ def kmer_to_id(s):
 
     :param s: The input k-mer as string
     :type s: str
-    :returns: The kPal-inspired binary encoding
+    :raise TypeError: str argument required, non-str type detected
+    :raise KeyError: Non-standard (ATCG) character detected
+    :returns: The kPal-inspired binary encoding (thanks!)
    :rtype: int
 
     """
@@ -306,10 +383,12 @@ def id_to_kmer(id, k):
     :type id: int
     :param k: The int k is used to byte convert the id to the sequence of characters
     :type k: int
+    :raise TypeError: int argument required, non-int type detected
     :returns: The k-mer as string
     :rtype: str
 
     """
     if type(id) is not int:
+        logger.error("kmer.id_to_kmer was given an id of the following type:")
         logger.error(type(id))
         raise TypeError("kmerdb.id_to_kmer expects an int as its first positional argument")
     elif type(k) is not int:
@@ -325,32 +404,98 @@ def id_to_kmer(id, k):
         return ''.join(kmer)
 
 
-def neighbors(s, k):
+def neighbors(kmer:str, kmer_id:int, k:int, quiet:bool=True):
     """
-    Create the basic neighbors kmer_metadata dictionary. Soon to be deprecated.
-    :param s: The sequence as a string that will be sliced for k-mers
-    :type s: str
+    3/11/24 revived. Given a k-mer of length k, return its neighbors.
+
+    This is so ugly. 
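+
+    A hand-worked sketch of the intended behavior (not captured program output):
+
+        neighbors("ACGTT", kmer_to_id("ACGTT"), 5)
+        # returns the ids of CGTTA, CGTTC, CGTTG, CGTTT (first char removed, one base appended),
+        # followed by the ids of AACGT, CACGT, GACGT, TACGT (last char removed, one base prepended)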
+
+    But rock on :
+
+
+    :param kmer: The k-mer, as a string, whose neighbors will be generated
+    :type kmer: str
+    :param kmer_id: The k-mer id for neighbor generation
+    :type kmer_id: int
     :param k: The int k
     :type k: int
-    :returns: The neighbors for a certain sequence s as a str
-    :rtype: dict
+    :raise TypeError: Requires kmer to be a str
+    :raise TypeError: Requires k to be an int
+    :raise ValueError: k must match the length of the input k-mer
+    :returns: A list of 'neighboring' or 'adjacent' k-mer ids derived from the input k-mer
+    :rtype: list
 
     """
-    if not isinstance(s, str):
-        raise TypeError("kmerdb.kmer.neighbors expects a Biopython Seq object as its first positional argument")
+    if not isinstance(kmer, str):
+        raise TypeError("kmerdb.kmer.neighbors expects a str as its first positional argument")
     elif type(k) is not int:
         raise TypeError("kmerdb.kmer.neighbors expects an int as its second positional argument")
-    elif len(s) != k:
-        raise TypeError("kmerdb.kmer.neighbors cannot calculate the {0}-mer neighbors of a {1}-mer".format(k, len(s)))
+    elif len(kmer) != k:
+        raise ValueError("kmerdb.kmer.neighbors cannot calculate the {0}-mer neighbors of a {1}-mer".format(k, len(kmer)))
     else:
         import copy
         letters1 = copy.deepcopy(binaryToLetter)
         letters2 = copy.deepcopy(letters1)
 
-        firstChar = s[0]
-        lastChar = s[-1]
-        suffix = s[1:]
-        prefix = s[:-1]
-        rightNeighborIds = dict((c, kmer_to_id(suffix+c)) for c in letters1)
-        leftNeighborIds = dict((c, kmer_to_id(c+prefix)) for c in letters2)
-        return {"suffixes": rightNeighborIds, "prefixes": leftNeighborIds}
+        firstCharRemoved = kmer[1:]
+        lastCharRemoved = kmer[:-1]
+
+        new_type1 = list(map(lambda c: firstCharRemoved + c, letters1)) # Form the 'suffix' neighbors: first char removed, one base appended
+
+        new_type2 = list(map(lambda c: c + lastCharRemoved, letters2)) # Form the 'prefix' neighbors: last char removed, one base prepended
+
+        """
+        # TYPE 1: [first char removed ... + ... c : ["A", "C", "G", "T"]]
+
+        # TYPE 2: [["A", "C", "G", "T"] : c + last char removed ]
+        """
+        new_type1_ids = list(map(kmer_to_id, new_type1))
+        new_type2_ids = list(map(kmer_to_id, new_type2))
+
+    logger.debug("kmerdb.kmer.neighbors creating neighbors for k-mer '{0}'...".format(kmer))
+    logger.debug("kmerdb.kmer.neighbors: suffix-neighbor ids of k-mer id '{0}': {1}".format(kmer_id, ", ".join(map(str, new_type1_ids))))
+    logger.debug("kmerdb.kmer.neighbors: prefix-neighbor ids of k-mer id '{0}': {1}".format(kmer_id, ", ".join(map(str, new_type2_ids))))
+    if quiet is not True:
+        sys.stderr.write("""
+    k-id : {0}
+    kmer : \" {1} \"
+
+    'neighbors'
+
+    {2}
+    {3}
+
+    'ids':
+
+    {4}
+    {5}
+""".format(kmer_id, kmer, new_type1, new_type2, new_type1_ids, new_type2_ids))
+    #return {"appended_first_char_all_ommitted": new_type1_ids, "prepended_last_char_all_omitted": new_type2_ids}
+    return new_type1_ids + new_type2_ids
diff --git a/kmerdb/parse.py b/kmerdb/parse.py
index 5f41af6..becee4c 100644
--- a/kmerdb/parse.py
+++ b/kmerdb/parse.py
@@ -15,9 +15,11 @@
 '''
 
-
+import io
 import os
 import sys
+import gzip
+import hashlib
 import yaml
 import json
 import time
@@ -26,6 +28,9 @@
 
 from itertools import chain, repeat
 
+from Bio import SeqIO
+
+
 # from sqlalchemy.orm import sessionmaker
 # from sqlalchemy.orm.attributes import flag_modified
 # from sqlalchemy.ext.declarative import declarative_base
@@ -36,7 +41,7 @@
 
 import numpy as np
 
-from kmerdb import seqparser, kmer
+from kmerdb import kmer
 
 import logging
 logger = logging.getLogger(__file__)
@@ -56,45 +61,59 @@ def parsefile(filepath:str, k:int, rows_per_batch:int=100000, b:int=50000, n:int
     :type b: int
     :param stranded: Strand specificity argument for k-mer shredding process
     :type stranded: bool
-    :returns: (counts, header_dictionary, nullomers) header_dictionary is the file's metadata for the header block
-    :rtype: (numpy.ndarray, dict, list)
+    :raise TypeError: filepath was invalid
+    :raise OSError: filepath was invalid
+    :raise TypeError: k was invalid
+    :raise TypeError: rows_per_batch was invalid
+    :raise TypeError: b was invalid
+    :raise TypeError: n was invalid
+    :raise TypeError: both_strands was invalid
+    :raise TypeError: all_metadata was invalid
+    :raise TypeError: invalid dtype
+    :raise ValueError: invalid (None) kmer id detected
+    :raise ValueError: mismatched number of kmer_ids, associated sequence/read ids, starting locations, and reverse bools
+    :raise AssertionError: Error in nullomer count estimation
+    :returns: (counts, header_dictionary, nullomer_array, all_metadata) header_dictionary is the file's metadata for the header block
+    :rtype: (numpy.ndarray, dict, list, list)
 
     """
     from kmerdb import kmer
-    if type(filepath) is not str:
+    if filepath is None or type(filepath) is not str:
         raise TypeError("kmerdb.parse.parsefile expects a str as its first positional argument")
     elif not os.path.exists(filepath):
         raise OSError("kmerdb.parse.parsefile could not find the file '{0}' on the filesystem".format(filepath))
-    elif type(k) is not int:
+    elif k is None or type(k) is not int:
         raise TypeError("kmerdb.parse.parsefile expects an int as its second positional argument")
-    elif type(b) is not int:
+    elif rows_per_batch is None or type(rows_per_batch) is not int:
+        raise TypeError("kmerdb.parse.parsefile expects the 
keyword argument 'rows_per_batch' to be an int")
+    elif b is None or type(b) is not int:
         raise TypeError("kmerdb.parse.parsefile expects the keyword argument 'b' to be an int")
-    elif type(n) is not int:
+    elif n is None or type(n) is not int:
         raise TypeError("kmerdb.parse.parsefile expects the keyword argument 'n' to be an int")
-    elif type(both_strands) is not bool:
+    elif both_strands is None or type(both_strands) is not bool:
         raise TypeError("kmerdb.parse.parsefile expects the keyword argument 'both_strands' to be a bool")
-    elif type(all_metadata) is not bool:
+    elif all_metadata is None or type(all_metadata) is not bool:
         raise TypeError("kmerdb.parse.parsefile expects the keyword argument 'all_metadata' to be a bool")
     data = {} # This is the dictionary of tuples, keyed on k-mer id, and containing 3-tuples ('kmer_id', 'read/start/reverse')
     keys = set()
     rows = []
-    total_kmers = 4**k
+    N = 4**k
     nullomers = set()
     # Build fasta/fastq parser object to stream reads into memory
     logger.debug("Initializing parser...")
-    seqprsr = seqparser.SeqParser(filepath, b, k)
+    seqprsr = SeqParser(filepath, b, k)
     fasta = not seqprsr.fastq # Look inside the seqprsr object for the type of file
     # Initialize the kmer array
     try:
-        counts = np.zeros(total_kmers, dtype="uint64")
+        counts = np.zeros(N, dtype="uint64")
     except TypeError as e:
         logger.error("Invalid dtype for numpy array instantiation")
         logger.error(e)
         raise e
-    logger.info("Successfully allocated space for {0} unsigned integers: {1} bytes".format(total_kmers, counts.nbytes))
+    logger.info("Successfully allocated space for {0} unsigned integers: {1} bytes".format(N, counts.nbytes))
     # Instantiate the kmer class
     Kmer = kmer.Kmers(k, strand_specific=not both_strands, verbose=fasta, all_metadata=all_metadata) # A wrapper class to shred k-mers with
@@ -105,7 +124,7 @@ def parsefile(filepath:str, k:int, rows_per_batch:int=100000, b:int=50000, n:int
     else:
         logger.debug("Skipping the block size assertion for fasta files")
     logger.info("Read {0} sequences from the {1} seqparser object".format(len(recs), "fasta" if fasta else "fastq"))
-    all_kmer_metadata = list([] for x in range(total_kmers))
+    all_kmer_metadata = list([] for x in range(N))
     while len(recs): # While the seqprsr continues to produce blocks of reads
         num_recs = len(recs)
@@ -182,7 +201,7 @@ def parsefile(filepath:str, k:int, rows_per_batch:int=100000, b:int=50000, n:int
             # Actually they were just introduced to be filtered out, instead of deleted
            # Because each deletion would change the array index
            # So instead we set them to None, and filter out
-            raise ValueError("K-mer ids should never actually be none.")
+            raise ValueError("kmerdb.parse.parsefile encountered an invalid kmer_id. Internal error.")
 
         #logger.debug(kmer_ids)
         logger.debug("{0} 'clean' kmers were identified successfully from {1} input sequences".format(len(kmer_ids), num_recs))
@@ -204,8 +223,7 @@ def parsefile(filepath:str, k:int, rows_per_batch:int=100000, b:int=50000, n:int
                 logger.error("{0} start positions for the associations found".format(len(starts)))
                 logger.error("{0} reverse bools for each association".format(len(reverses)))
 
-                raise ValueError("Unexpectedly, the number of ids did not match up with the number of other metadata elements per k-mer OR other unknown error")
-
+                raise ValueError("Unexpectedly, the number of ids did not match up with the number of other metadata elements per k-mer OR other unknown error. 
Internal error.") # else: # raise RuntimeError("Still have no clue what's going on...") @@ -216,7 +234,7 @@ def parsefile(filepath:str, k:int, rows_per_batch:int=100000, b:int=50000, n:int num_kmers = len(kmer_ids) if num_kmers == 0: - raise ValueError("No k-mers to add. Something likely went wrong. Please report to the issue tracker") + raise ValueError("No k-mers available to add. Please report to the issue tracker") else: sys.stderr.write("\nAccumulating all k-mers from this set of records...") for kmer in kmer_ids: @@ -226,26 +244,199 @@ def parsefile(filepath:str, k:int, rows_per_batch:int=100000, b:int=50000, n:int for single_kmer_id, read, start, reverse in kmer_metadata: all_kmer_metadata[single_kmer_id].append((read, start, reverse)) - recs = [r for r in seqprsr] # The next block of exactly 'b' reads - # This will be logged redundantly with the sys.stderr.write method calls at line 141 and 166 of seqparser.py (in the _next_fasta() and _next_fastq() methods) - #sys.stderr("\n") logger.info("Read {0} more records from the {1} seqparser object".format(len(recs), "fasta" if fasta else "fastq")) # Get nullomers # only nullomer counts unique_kmers = int(np.count_nonzero(counts)) - num_nullomers = total_kmers - unique_kmers + + + all_theoretical_kmer_ids = list(range(N)) + + + + # FIXME + num_nullomers = N - unique_kmers is_nullomer = np.where(counts == 0) - nullomers = [] - for i, j in enumerate(is_nullomer): - if j is True: - nullomers.append(i) + + + assert num_nullomers == len(is_nullomer[0]), "kmerdb.parse module find inconsistencies between two ways of counting nullomers. Internal error." + seqprsr.total_kmers = int(np.sum(counts)) seqprsr.unique_kmers = unique_kmers seqprsr.nullomers = num_nullomers + + + + seqprsr.nullomer_array = np.array(all_theoretical_kmer_ids, dtype="uint64")[is_nullomer] sys.stderr.write("\n\n\nFinished counting k-mers{0} from '{1}'...\n\n\n".format(' and metadata' if all_metadata else '', filepath)) - return counts, seqprsr.header_dict(), nullomers, all_kmer_metadata + return counts, seqprsr.header_dict(), seqprsr.nullomer_array, all_kmer_metadata + + + + + +class SeqParser: + """ + Largely useless module, needs 3 pieces of information passed back in from the outside. + This performs ugly decompression of fasta and fastq files, patching the __next__ methods, effectively. + It allows you to read either fasta or fastq data in blocks, obviously useful for the latter. + + + :ivar filepath: The .fastq, .fastq.gz, .fasta, .fasta.gz, .fna, .fna.gz, .fa, or .fa.gz file. + :ivar num: The number of records to read in from a .fastq + :ivar k: The choice of k to initialize the calculation of kmer/nullomer counts. 
+    """
+    def __init__(self, filepath, num, k):
+        """
+        Set up the parser type (fasta vs. fastq, gzipped or not), open the file handle, and compute the file checksums.
+        """
+
+        if type(filepath) is not str:
+            raise TypeError("kmerdb.parse.SeqParser expects a str as its first positional argument")
+        elif type(num) is not int:
+            raise TypeError("kmerdb.parse.SeqParser expects an int as its second positional argument")
+        elif type(k) is not int:
+            raise TypeError("kmerdb.parse.SeqParser expects an int as its third positional argument")
+
+        self.k = k
+        self.num = num
+        self.reads = []
+        # Header items
+        self.filepath = filepath
+        self.md5 = None
+        self.sha256 = None
+        self.total_reads = 0
+        self.total_kmers = 0
+        self.unique_kmers = 0
+        self.nullomers = 0
+        self.nullomer_array = []
+        self.compressed = False
+        self.fastq = False
+        exts = os.path.splitext(filepath)
+
+        if exts[-1] == ".gz":
+            self.compressed = True
+            nogzexts = os.path.splitext(exts[0])
+            if nogzexts[-1] == ".fq" or nogzexts[-1] == ".fastq":
+                self.fastq = True
+            elif nogzexts[-1] == ".fna" or nogzexts[-1] == ".fasta" or nogzexts[-1] == ".fa":
+                self.fastq = False
+            else:
+                raise ValueError("Cannot parse files of extension '{0}'.\n\nRequires fasta (.fna, .fasta, .fa), fastq (.fq, .fastq), or their gzipped equivalents".format(nogzexts[-1]))
+        else: # Must be fasta or fastq uncompressed
+            if exts[-1] == ".fq" or exts[-1] == ".fastq":
+                self.fastq = True
+            elif exts[-1] == ".fna" or exts[-1] == ".fasta" or exts[-1] == ".fa":
+                self.fastq = False
+            else:
+                raise ValueError("Cannot parse files of extension '{0}'.\n\nRequires fasta (.fna, .fasta, .fa), fastq (.fq, .fastq), or their gzipped equivalents".format(exts[-1]))
+
+        # This is a really ugly patch to add appropriate fastq and fasta next behavior.
+        if self.fastq:
+            self.__class__.__iter__ = self._iter_fastq
+            self.__class__.__next__ = self._next_fastq
+        else:
+            self.__class__.__iter__ = self._iter_fasta
+            self.__class__.__next__ = self._next_fasta
+
+        if self.compressed:
+            if self.fastq:
+                logger.info("Opening gzipped fastq file '{0}'...".format(filepath))
+                self._handle = SeqIO.parse(gzip.open(self.filepath, 'rt'), "fastq")
+            else:
+                logger.info("Opening gzipped fasta file '{0}'...".format(filepath))
+                self._handle = SeqIO.parse(gzip.open(self.filepath, 'rt'), "fasta")
+        else:
+            if self.fastq:
+                logger.info("Opening uncompressed fastq file '{0}'...".format(filepath))
+                self._handle = SeqIO.parse(open(self.filepath, 'r'), "fastq")
+            else:
+                logger.info("Opening uncompressed fasta file '{0}'...".format(filepath))
+                self._handle = SeqIO.parse(open(self.filepath, 'r'), "fasta")
+        # Get checksums
+        self.md5, self.sha256 = self.__checksum()
+
+
+    def __checksum(self):
+        """Generates md5 and sha256 checksums of a file
+        :returns: (md5, sha256)
+        :rtype: tuple
+        """
+        if not os.path.exists(self.filepath):
+            raise IOError("kmerdb.parse.SeqParser.__checksum could not find '{}' on the filesystem".format(self.filepath))
+        hash_md5 = hashlib.md5()
+        hash_sha256 = hashlib.sha256()
+        with open(self.filepath, 'rb') as ifile:
+            for chunk in iter(lambda: ifile.read(4096), b""):
+                hash_md5.update(chunk)
+                hash_sha256.update(chunk)
+        return (hash_md5.hexdigest(), hash_sha256.hexdigest())
+
+    def header_dict(self):
+        """ Create a header dictionary to convert into YAML to go in the header block(s) of the compression header. 
Has a schema to be validated, defined in config.py
+
+        :returns: dict
+        :rtype: dict
+
+        """
+        return {
+            "filename": self.filepath,
+            "md5": self.md5,
+            "sha256": self.sha256,
+            "total_reads": self.total_reads,
+            "total_kmers": self.total_kmers,
+            "unique_kmers": self.unique_kmers or 4**self.k - self.nullomers,
+            "nullomers": self.nullomers,
+        }
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._handle.close()
+
+    def __enter__(self):
+        return self
+
+
+    def _iter_fastq(self):
+        """A custom iterator that buffers up to 'num' fastq records into the 'reads' array per pass.
+        """
+        try:
+            for i in range(self.num):
+                self.reads.append(next(self._handle))
+        except StopIteration as e:
+            pass
+        except ValueError as e:
+            logger.error(e)
+            logger.error("\n\nFastq format error: '{0}' seems to not be fastq format\n\n".format(self.filepath))
+            raise e
+        return self
+
+    def _next_fastq(self):
+        """
+        Pop a single read from the buffered batch.
+
+        """
+        if not len(self.reads):
+            raise StopIteration
+        else:
+            self.total_reads += 1
+            read = self.reads.pop()
+            return read
+
+    def _iter_fasta(self):
+        return self
+
+    def _next_fasta(self):
+        seq = next(self._handle)
+        self.total_reads += 1
+        sys.stderr.write("Read {0} sequences from '{1}'...\n".format(self.total_reads, self.filepath))
+        return seq
+
+
+
+
@@ -258,7 +449,7 @@ def __init__(self, arguments):
     def parsefile(self, filename):
         """Wrapper function for parse.parsefile to keep arguments succinct for deployment through multiprocessing.Pool
 
-        :param filename: the filepath of the fasta(.gz)/fastq(.gz) to process with parsefile -> seqparser
+        :param filename: the filepath of the fasta(.gz)/fastq(.gz) to process with parsefile -> parse.SeqParser
         :type filename: str
         :returns: (db, m, n)
         """
diff --git a/kmerdb/seqparser.py b/kmerdb/seqparser.py
deleted file mode 100644
index 49c01c5..0000000
--- a/kmerdb/seqparser.py
+++ /dev/null
@@ -1,186 +0,0 @@
-'''
-   Copyright 2020 Matthew Ralston
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-
-'''
-
-
-
-import logging
-logger = logging.getLogger(__file__)
-
-
-import io
-import os
-import sys
-import gzip
-import hashlib
-
-from Bio import SeqIO
-import Bio
-
-class SeqParser:
-    """
-    Largely useless module, needs 3 pieces of information passed back in from the outside.
-    This performs ugly decompression of fasta and fastq files, patching the __next__ methods, effectively.
-    It allows you to read either fasta or fastq data in blocks, obviously useful for the latter.
-
-
-    :ivar filepath: The .fastq, .fastq.gz, .fasta, .fasta.gz, .fna, .fna.gz, .fa, or .fa.gz file.
-    :ivar num: The number of records to read in from a .fastq
-    :ivar k: The choice of k to initialize the calculation of kmer/nullomer counts. 
- """ - def __init__(self, filepath, num, k): - if type(filepath) is not str: - raise TypeError("kdb.seqrecord.SeqParser expects a str as its first positional argument") - elif type(num) is not int: - raise TypeError("kmerdb.seqparser.SeqParser expects an int as its second positional argument") - elif type(k) is not int: - raise TypeError("kmerdb.seqparser.SeqParser expects an int as its third positional argument") - - self.k = k - self.num = num - self.reads = [] - # Header items - self.filepath = filepath - self.md5 = None - self.sha256 = None - self.total_reads = 0 - self.total_kmers = 0 - self.unique_kmers = 0 - self.nullomers = 0 - self.compressed = False - self.fastq = False - exts = os.path.splitext(filepath) - - if exts[-1] == ".gz": - self.compressed = True - nogzexts = os.path.splitext(exts[0]) - if nogzexts[-1] == ".fq" or nogzexts[-1] == ".fastq": - self.fastq = True - elif nogzexts[-1] == ".fna" or nogzexts[-1] == ".fasta" or exts[-1] == ".fa": - self.fastq = False - else: - raise ValueError("Cannot parse files of extension '{0}'.\n\nRequires fasta (.fna, .fasta, .fa), fastq (.fq, .fastq), or their gzipped equivalents") - else: # Must be fasta or fastq uncompressed - if exts[-1] == ".fq" or exts[-1] == ".fastq": - self.fastq = True - elif exts[-1] == ".fna" or exts[-1] == ".fasta" or exts[-1] == ".fa": - self.fastq = False - else: - raise ValueError("Cannot parse files of extension '{0}'.\n\nRequires fasta (.fna, .fasta, .fa), fastq (.fq, .fastq), or their gzipped equivalents") - - # This is a really ugly patch to add appropriate fastq and fasta next behavior. - if self.fastq: - self.__class__.__iter__ = self._iter_fastq - self.__class__.__next__ = self._next_fastq - else: - self.__class__.__iter__ = self._iter_fasta - self.__class__.__next__ = self._next_fasta - - if self.compressed: - if self.fastq: - logger.info("Opening gzipped fastq file '{0}'...".format(filepath)) - self._handle = SeqIO.parse(gzip.open(self.filepath, 'rt'), "fastq") - else: - logger.info("Opening gzipped fasta file '{0}'...".format(filepath)) - self._handle = SeqIO.parse(gzip.open(self.filepath, 'rt'), "fasta") - else: - if self.fastq: - logger.info("Opening uncompressed fastq file '{0}'...".format(filepath)) - self._handle = SeqIO.parse(open(self.filepath, 'r'), "fastq") - else: - logger.info("Opening uncompressed fasta file '{0}'...".format(filepath)) - self._handle = SeqIO.parse(open(self.filepath, 'r'), "fasta") - # Get checksums - self.md5, self.sha256 = self.__checksum() - - - def __checksum(self): - """Generates md5 and sha256 checksums of a file - :returns: (md5, sha256) - :rtype: tuple - """ - if not os.path.exists(self.filepath): - raise IOError("kmerdb.seqparser.FastqParser.__checksum could not find '{}' on the filesystem".format(filepath)) - hash_md5 = hashlib.md5() - hash_sha256 = hashlib.sha256() - with open(self.filepath, 'rb') as ifile: - for chunk in iter(lambda: ifile.read(4096), b""): - hash_md5.update(chunk) - hash_sha256.update(chunk) - return (hash_md5.hexdigest(), hash_sha256.hexdigest()) - - def header_dict(self): - """ Create a header dictionary to convert into YAML to go in the header block(s) of the compression header. 
Has a schema to be validated, defined in config.py - - :returns: dict - :rtype: dict - - """ - return { - "filename": self.filepath, - "md5": self.md5, - "sha256": self.sha256, - "total_reads": self.total_reads, - "total_kmers": self.total_kmers, - "unique_kmers": self.unique_kmers or 4**self.k - self.nullomers, - "nullomers": self.nullomers, - } - - def __exit__(self, exc_type, exc_value, traceback): - self._handle.close() - - def __enter__(self): - return self - - - def _iter_fastq(self): - """A custom iterator method to add to the 'reads' array as iterated upon. - """ - try: - for i in range(self.num): - self.reads.append(next(self._handle)) - except StopIteration as e: - pass - except ValueError as e: - logger.error(e) - logger.error("\n\nFastq format error: '{0}' seems to not be fastq format\n\n") - raise e - return self - - def _next_fastq(self): - """ - A custom mononucleotide counter - - """ - if not len(self.reads): - raise StopIteration - else: - self.total_reads += 1 - read = self.reads.pop() - return read - - def _iter_fasta(self): - return self - - def _next_fasta(self): - - seq = next(self._handle) - self.total_reads += 1 - sys.stderr.write("Read {0} sequences from '{1}'...\n".format(self.total_reads, self.filepath)) - return seq - - - diff --git a/kmerdb/util.py b/kmerdb/util.py index 86ba208..b01f48f 100644 --- a/kmerdb/util.py +++ b/kmerdb/util.py @@ -25,8 +25,12 @@ from collections import deque, OrderedDict +from kmerdb import config -def represent_ordereddict(dumper, data): + + + +def represent_yaml_from_collections_dot_OrderedDict(dumper, data): """ Thanks to Blender and EquipDev on StackOverflow for this handy method to pass to yaml.add_representer https://stackoverflow.com/a/16782282 @@ -40,6 +44,8 @@ def represent_ordereddict(dumper, data): :param data: :type data: dict """ + + value = [] for item_key, item_value in data.items(): @@ -51,20 +57,3 @@ def represent_ordereddict(dumper, data): return yaml.nodes.MappingNode(u'tag:yaml.org,2002:map', value) -def merge_metadata_lists(k, meta_metadata_across_all_files, new_kmer_meta_metadata): - """ - Merge two 4**k metadata lists. - :param k: The choice of k is important for count array indexing. - :type k: int - :param meta_metadata_across_all_files: - :type meta_metadata_across_all_files: list - :param new_kmer_meta_metadata: - :type new_kmer_meta_metadata: list - """ - if 4**k != len(new_kmer_meta_metadata): - raise TypeError("kmerdb.util.merge_metadata_lists() expects a new_kmer_metadata list to have 4^{0} elements. 
Got {1}".format(k, len(new_kmer_meta_metadata))) - - for i, meta_metadata in enumerate(new_kmer_meta_metadata): - meta_metadata_across_all_files[i] = meta_metadata_across_all_files[i] + meta_metadata - - return meta_metadata_across_all_files diff --git a/pyproject.toml b/pyproject.toml index 3fb858c..86f45c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,12 +15,15 @@ [build-system] -requires = ["setuptools>=61.0", "wheel", "numpy>=1.22.0", "Cython>=0.29.21"] + +# setuptools 65.5.0 + +requires = ["setuptools>=61.0", "wheel", "numpy>=1.22.0", "Cython>=3.0.8"] build-backend = "setuptools.build_meta" [project] name = "kmerdb" -version = "0.7.6" +version = "0.7.8" description = "Yet another kmer library for Python" readme = "README.md" authors = [{name="Matt Ralston ", email="mralston.development@gmail.com"}] @@ -44,81 +47,123 @@ classifiers = [ ] keywords = ["bioinformatics", "fastq", "fasta", "k-mer", "kmer", "k-merdb", "kmerdb", "kdb"] dependencies = [ - ######################################### - # AWS - ######################################### - #'boto3==1.20.25', - #'botocore==1.23.25', - #'s3transfer==0.5.0', - ######################################### - # BioPython - ######################################### - 'biopython>=1.79', - - ######################################### - # Cython - ######################################### - 'cython>=0.29.26', - - ######################################### - # Distlib - ######################################### - 'distlib>=0.3.4', - - - ######################################### - # Graphics - ######################################### - 'matplotlib>=3.6.2', - - - ######################################### - # Scientific computing - ######################################### - #'cupy>=11.0.0', - #'numba==0.52.0', - - # Old version requirements. 
Incorrect Comparison in NumPy #5 under security/ - #'numpy>=1.21.6', - #'pandas>=1.3.5', - #'scipy>=1.7.3', - #'sklearn>=0.0.post1', - - #'numpy>=1.24.1', - 'numpy>=1.22.0', - 'pandas>=1.3.5', - 'scipy>=1.9.3', - 'scikit-learn>=1.4.0', - - ####################### - # Statistical language - ####################### - 'rpy2>=3.5.6', - - #################### - # Graph schema - #################### - #'rdflib==5.0.0', - #'rdflib-jsonld==0.5.0', - - ######################################### - # Configurations - ######################################### - 'PyYAML>=6.0', - 'jsonschema>=4.17.3', - - ######################################### - # Micellaneous - ######################################### - 'psutil>=5.9.4', - #'more-itertools==8.2.0' - - ######################################### - # urllib3 - ######################################### - # Note: this program does not connect to the network AT ALL - # but s3: downloads were supported for the 'kmerdb profile' subcommand - #urllib3==1.26.7 + +# h e a d e r + +######################################### + +# Build + +######################################### +"build>=0.9.0", + +######################################### + +# Setuptools + +# setuptools>=65.5.0 + +######################################### + +# Distlib + +######################################### +"distlib>=0.3.8", + + + +# --------------------- + +# Bio + +######################################### +"biopython>=1.83", + +######################################### + +# Cython + +######################################### +"Cython>=3.0.8", + + + + +######################################### + +# Documentation + +######################################### +"docutils>=0.17", +#docutils==0.18.1 + +"sphinx>=4.3.2", +#sphinxcontrib-htmlhelp>=2.0.0 +#sphinxcontrib-serializinghtml>=1.1.5 +#sphinx-rtd-theme>=1.0.0 + + +######################################### + +# Graphics + +######################################### +"matplotlib>=3.8.2", + + +######################################### + +# Scientific computing + +######################################### +#cupy>=11.0.0 +#numba==0.52.0 +"numpy==1.26.3", +"pandas>=2.2.0", +"scipy>=1.12.0", +"scikit-learn==1.4.0", + +####################### +# Statistical language +"rpy2>=3.4.2", +####################### + + + +#################### +# Graph schema +#rdflib==5.0.0 +#rdflib-jsonld==0.5.0 + +######################################### + +# Configurations + +######################################### +"PyYAML>=6.0.1", +"jsonschema>=4.21.1", + + + +######################################### + +# Micellaneous + +######################################### +"psutil>=5.9.8", +#more-itertools==8.2.0 + + + + + +######################################### + +# urllib3 + +######################################### +#urllib3==1.26.7 + ] requires-python = ">=3.8.0" diff --git a/requirements.txt b/requirements.txt index 0753962..0c1cdfe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,5 @@ -######################################### - -# AWS -######################################### -#boto3==1.20.25 -#botocore==1.23.25 -#s3transfer==0.5.0 +# h e a d e r ######################################### @@ -16,25 +10,33 @@ build>=0.9.0 ######################################### -# Bio +# Setuptools -######################################### -biopython>=1.79 +# setuptools>=65.5.0 ######################################### -# Cython +# Distlib ######################################### -cython>=0.29.26 +distlib>=0.3.8 + + +# --------------------- + +# Bio 
######################################### +biopython>=1.83 -# Distlib +######################################### + +# Cython ######################################### -distlib>=0.3.4 +Cython>=3.0.8 + @@ -57,7 +59,7 @@ sphinx>=4.3.2 # Graphics ######################################### -matplotlib>=3.5.1 +matplotlib>=3.8.2 ######################################### @@ -67,10 +69,12 @@ matplotlib>=3.5.1 ######################################### #cupy>=11.0.0 #numba==0.52.0 -numpy==1.22.0 -pandas>=1.3.5 -scipy>=1.9.3 -sklearn>=0.0.post1 +numpy==1.26.3 +pandas>=2.2.0 +scipy>=1.12.0 +scikit-learn==1.4.0 +networkx==3.2.1 + ####################### # Statistical language @@ -89,8 +93,8 @@ rpy2>=3.4.2 # Configurations ######################################### -PyYAML>=6.0 -jsonschema>=4.17.3 +PyYAML>=6.0.1 +jsonschema>=4.21.1 @@ -99,7 +103,7 @@ jsonschema>=4.17.3 # Micellaneous ######################################### -psutil>=5.9.4 +psutil>=5.9.8 #more-itertools==8.2.0 diff --git a/setup.py b/setup.py index a444241..5694ba8 100644 --- a/setup.py +++ b/setup.py @@ -113,7 +113,7 @@ def can_import(module_name): EMAIL = 'mrals89@gmail.com' AUTHOR = 'Matthew Ralston' REQUIRES_PYTHON = '>=3.8.0' -VERSION = "0.7.6" +VERSION = "0.7.8" KEYWORDS = ["bioinformatics", "fastq", "fasta", "k-mer", "kmer", "k-merdb", "kmerdb", "kdb"], CLASSIFIERS = [ "Development Status :: 1 - Planning",