use ExtUtils::testlib;
use Data::Dumper;
use HDFPerl;
use strict;
use Init;
use POSIX;

package BioHDF_Perl;

# Size and tuning constants used by the routines of this package.
use constant {
    ID_LENGTH           => 25,
    LIN_SEARCH_BUF_SIZE => 1000,
    DESCRIPTION_SIZE    => 50,
    NAME_SIZE           => 20,
    CHUNK_SIZE          => 5000,

    # levels 0-9. lower levels are faster but result in less compression
    COMPRESSION_LEVEL   => 7,
};

################################################
#
# HIGH LEVEL APIS FOR FASTA DATA MANAGEMENT IN
# HDF5 FILE
#
################################################

# Creates an HDF5 sequence file.
# The call is forwarded to HDFPerl::h5fcreate_p with $Init::H5F_ACC_TRUNC as
# the access flag and $Init::H5P_DEFAULT for both remaining arguments.
# INPUT: file name
# RETURN: file identifier (whatever HDFPerl::h5fcreate_p hands back; no error
#         checking is done here)
sub create_sequence_file {
    my ($path) = @_;

    my $file_id = HDFPerl::h5fcreate_p(
        $path,
        $Init::H5F_ACC_TRUNC,
        $Init::H5P_DEFAULT,
        $Init::H5P_DEFAULT,
    );

    return $file_id;
}

# Opens an existing HDF5 sequence file.
# The call is forwarded to HDFPerl::h5fopen_p with $Init::H5F_ACC_RDWR as the
# access flag and $Init::H5P_DEFAULT as the last argument.
# INPUT: file name
# RETURN: file identifier (whatever HDFPerl::h5fopen_p hands back; no error
#         checking is done here)
sub open_sequence_file {
    my ($path) = @_;

    # scalar() keeps the wrapped call in scalar context regardless of how
    # this sub itself is called.
    return scalar(
        HDFPerl::h5fopen_p($path, $Init::H5F_ACC_RDWR, $Init::H5P_DEFAULT)
    );
}

# Gets the description of a collection.
# The description is the comment attached to "." relative to the first
# identifier of the collection, read through HDFPerl::h5gget_comment_p with
# DESCRIPTION_SIZE as its size argument.
# INPUT: reference to array containing objects identifiers
# RETURN: collection description
sub get_collection_description {
    my ($collection_ref) = @_;

    # Only the first identifier of the collection is needed.
    my ($first_object) = @{$collection_ref};

    return scalar(
        HDFPerl::h5gget_comment_p($first_object, ".", DESCRIPTION_SIZE)
    );
}

# This function looks for collections names in an HDF5 sequence file.
# INPUT: file identifier
# RETURN: reference to array containing collections names.
# On error, it returns a negative value
# (the negative object count reported by HDFPerl::h5gget_num_objs_p is
# passed back unchanged, so the caller gets a plain number, not a reference).
sub get_sequence_collections {
    my ($fid) = @_;

    # get number of collections in sequence file
    my $count = HDFPerl::h5gget_num_objs_p($fid);
    return $count if $count < 0;

    # iterate on collections, asking for each name by its index; NAME_SIZE is
    # passed as the size argument of the lookup
    my @names = ();
    for my $idx (0 .. $count - 1) {
        push @names,
            scalar(HDFPerl::h5gget_objname_by_idx_p($fid, $idx, NAME_SIZE));
    }

    return \@names;
}

# This function gets the sequence identifiers for a given collection.
# Only the first identifier of the collection is used: its dataspace is
# fetched, and the "id" field is read through a one-member compound type.
# INPUT: reference to array containing objects identifiers
#
# RETURN: reference to array containing sequence identifiers. On error,
#         it returns a negative value
#         NOTE(review): the value is whatever HDFPerl::h5dread_string_p
#         returns; this sub performs no error check of its own.
sub get_sequence_ids {
    my ($collection_ref) = @_;
    my ($dataset) = @{$collection_ref};

    my $space = HDFPerl::h5dget_space_p($dataset);

    # read "id" field from all sequences in the collection:
    # a string type of ID_LENGTH, wrapped in a compound type of the same
    # size with "id" as its only member at offset 0
    my $string_type   = HDFPerl::h5tcreate_string_p(ID_LENGTH);
    my $compound_type = HDFPerl::h5tcreate_compound_p(ID_LENGTH);
    HDFPerl::h5tinsert_p($compound_type, "id", 0, $string_type);

    # the same dataspace is passed for both dataspace arguments
    my $ids_ref = HDFPerl::h5dread_string_p(
        $dataset, $compound_type, $space, $space, $Init::H5P_DEFAULT,
    );

    # release the handles created above before returning
    HDFPerl::h5tclose_p($compound_type);
    HDFPerl::h5tclose_p($string_type);
    HDFPerl::h5sclose_p($space);

    return $ids_ref;
}

# This function sorts the "ids" dataset with respect to the "id" field.
# Sorting is necessary to perform binary searches. At this point, the sorting
# of the entire dataset is performed in memory.
# INPUT: collection ids # # RETURN: status sub sort_sequence_collection { my $collection_ref = shift; my @collection = @{$collection_ref}; my @ids_buf=(); my %ids_hash=(); my @unit_array=(1); my @status=(); # read "id" field from all sequences in the collection my $sid = HDFPerl::h5dget_space_p($collection[0]); my $stid = HDFPerl::h5tcreate_string_p(ID_LENGTH); my $cid1 = HDFPerl::h5tcreate_compound_p(ID_LENGTH); HDFPerl::h5tinsert_p($cid1, "id", 0, $stid); $ids_buf[0] = HDFPerl::h5dread_string_p($collection[0], $cid1, $sid, $sid, $Init::H5P_DEFAULT); HDFPerl::h5tclose_p($stid); # read "index" field from all sequences in the collection my $intsize = HDFPerl::h5tget_size_p($Init::H5T_NATIVE_INT); my $cid2 = HDFPerl::h5tcreate_compound_p($intsize); HDFPerl::h5tinsert_p($cid2, "index", 0, $Init::H5T_NATIVE_INT); $ids_buf[1] = HDFPerl::h5dread_int_p($collection[0], $cid2, $sid, $sid, $Init::H5P_DEFAULT); my $i; my $id; # prepare hash for sorting for ($i=0; $i