Skip to content

Commit 774feeb

Browse files
author
Daniel Mapleson
committed
Added ability to detect file format from sequence file without known extension.
1 parent 2e54b67 commit 774feeb

File tree

3 files changed

+48
-0
lines changed

3 files changed

+48
-0
lines changed

lib/include/kat/input_handler.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ namespace kat {
6767

6868
static shared_ptr<vector<path>> globFiles(const string& input);
6969
static shared_ptr<vector<path>> globFiles(const vector<path>& input);
70+
71+
/**
 * Works out whether a sequence file is FASTA or FASTQ.
 * The file extension is checked first (case-insensitively); if it is not
 * a recognised one, the first non-whitespace character of the file is
 * inspected ('>' means FASTA, '@' means FASTQ).
 * @param file Path to the sequence file
 * @return Either "fasta" or "fastq"
 * @throws InputFileException if the file type could not be determined
 */
static string determineSequenceFileType(const path& file);
7072

7173
private:
7274
static int globerr(const char *path, int eerrno);

lib/src/input_handler.cc

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
#endif
2121

2222
#include <iostream>
23+
#include <fstream>
2324
#include <glob.h>
25+
using std::fstream;
2426
using std::stringstream;
2527

2628
#include <boost/filesystem.hpp>
@@ -259,3 +261,37 @@ shared_ptr<vector<path>> kat::InputHandler::globFiles(const vector<path>& input)
259261

260262
return globbed;
261263
}
264+
265+
/**
 * Works out whether the given sequence file is FASTA or FASTQ.
 *
 * The extension is tested first (case-insensitively).  If the extension is
 * not one of the recognised ones, the file is opened and its first
 * non-whitespace character is inspected: '>' indicates FASTA and '@'
 * indicates FASTQ.
 *
 * @param filename Path to the sequence file to classify
 * @return "fastq" or "fasta"
 * @throws InputFileException if the file cannot be opened for inspection,
 *         is empty, or does not look like a recognised sequence file
 */
string kat::InputHandler::determineSequenceFileType(const path& filename) {

    const string ext = filename.extension().string();

    // Check extension first
    if (boost::iequals(ext, ".fastq") || boost::iequals(ext, ".fq")) {
        return "fastq";
    }

    if (boost::iequals(ext, ".fasta") ||
        boost::iequals(ext, ".fa") ||
        boost::iequals(ext, ".fna") ||
        boost::iequals(ext, ".fas") ||
        boost::iequals(ext, ".scafSeq")) {
        return "fasta";
    }

    // Extension was not conclusive, so look at the first character of the file.
    // The stream is closed automatically when it goes out of scope.
    fstream fin(filename.string(), fstream::in);

    if (!fin.is_open()) {
        BOOST_THROW_EXCEPTION(InputFileException() << InputFileErrorInfo("Could not open file to determine its type"));
    }

    // Initialise the character and only trust it if the extraction succeeded;
    // an empty (or whitespace-only) file would otherwise leave it indeterminate.
    char ch = '\0';
    if (fin >> ch) {
        if (ch == '>') {
            return "fasta";
        }
        if (ch == '@') {
            return "fastq";
        }
    }

    // If we've got this far then it's not obviously a sequence file we recognise.
    BOOST_THROW_EXCEPTION(InputFileException() << InputFileErrorInfo("Unknown file type"));
}

tests/check_jellyfish.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ template<typename DtnType>
2828
inline double as_seconds(DtnType dtn) { return duration_cast<duration<double>>(dtn).count(); }
2929

3030
#include <kat/jellyfish_helper.hpp>
31+
#include <kat/input_handler.hpp>
3132
using kat::JellyfishHelper;
33+
using kat::InputHandler;
3234
using kat::HashLoader;
3335

3436
namespace kat {
@@ -209,4 +211,12 @@ TEST(jellyfish, unknownexttest) {
209211
EXPECT_EQ( res, true );
210212
}
211213

214+
// A data file with an unrecognised extension should be classified by
// inspecting its contents; the fixture is expected to be detected as FASTA.
TEST(jellyfish, determineexttest) {
    const path datFile(DATADIR "/unknown.dat");

    const string detectedType = InputHandler::determineSequenceFileType(datFile);

    EXPECT_EQ( detectedType, "fasta" );
}
221+
212222
}

0 commit comments

Comments
 (0)