Index: src/H5AC.c =================================================================== --- src/H5AC.c (revision 25532) +++ src/H5AC.c (working copy) @@ -4137,11 +4137,13 @@ HDassert( aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED ); +#ifndef WILDCARD /* to prevent "messages from the future" we must synchronize all * processes before we write any entries. */ if(MPI_SUCCESS != (mpi_code = MPI_Barrier(aux_ptr->mpi_comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed 1", mpi_code) +#endif if(aux_ptr->mpi_rank == 0) { if(H5AC_broadcast_candidate_list(cache_ptr, &num_candidates, &candidates_list_ptr) < 0) @@ -4636,9 +4638,11 @@ if(H5AC_copy_candidate_list_to_buffer(cache_ptr, &num_entries, &haddr_buf_ptr, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.") +#ifndef WILDCARD /* initial sync point barrier */ if(MPI_SUCCESS != (mpi_code = MPI_Barrier(aux_ptr->mpi_comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed 1", mpi_code) +#endif /* apply the candidate list */ aux_ptr->write_permitted = TRUE; Index: src/H5ACprivate.h =================================================================== --- src/H5ACprivate.h (revision 25532) +++ src/H5ACprivate.h (working copy) @@ -145,6 +145,8 @@ #define H5AC_NOTIFY_ACTION_AFTER_INSERT H5C_NOTIFY_ACTION_AFTER_INSERT #define H5AC_NOTIFY_ACTION_BEFORE_EVICT H5C_NOTIFY_ACTION_BEFORE_EVICT +typedef H5C_collective_write_t H5AC_collective_write_t; + typedef H5C_load_func_t H5AC_load_func_t; typedef H5C_flush_func_t H5AC_flush_func_t; typedef H5C_dest_func_t H5AC_dest_func_t; Index: src/H5B2cache.c =================================================================== --- src/H5B2cache.c (revision 25532) +++ src/H5B2cache.c (working copy) @@ -70,17 +70,17 @@ /* Metadata cache callbacks */ static H5B2_hdr_t *H5B2__cache_hdr_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5B2__cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B2_hdr_t *hdr, unsigned UNUSED * flags_ptr); +static herr_t H5B2__cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B2_hdr_t *hdr, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list); static herr_t H5B2__cache_hdr_dest(H5F_t *f, H5B2_hdr_t *hdr); static herr_t H5B2__cache_hdr_clear(H5F_t *f, H5B2_hdr_t *hdr, hbool_t destroy); static herr_t H5B2__cache_hdr_size(const H5F_t *f, const H5B2_hdr_t *hdr, size_t *size_ptr); static H5B2_internal_t *H5B2__cache_internal_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5B2__cache_internal_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B2_internal_t *i, unsigned UNUSED * flags_ptr); +static herr_t H5B2__cache_internal_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B2_internal_t *i, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list); static herr_t H5B2__cache_internal_dest(H5F_t *f, H5B2_internal_t *internal); static herr_t H5B2__cache_internal_clear(H5F_t *f, H5B2_internal_t *i, hbool_t destroy); static herr_t H5B2__cache_internal_size(const H5F_t *f, const H5B2_internal_t *i, size_t *size_ptr); static H5B2_leaf_t *H5B2__cache_leaf_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5B2__cache_leaf_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B2_leaf_t *l, unsigned UNUSED * flags_ptr); +static herr_t H5B2__cache_leaf_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B2_leaf_t *l, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list); static herr_t H5B2__cache_leaf_dest(H5F_t *f, H5B2_leaf_t *leaf); static herr_t H5B2__cache_leaf_clear(H5F_t *f, H5B2_leaf_t *l, hbool_t destroy); static herr_t H5B2__cache_leaf_size(const H5F_t *f, const H5B2_leaf_t *l, size_t *size_ptr); @@ -274,7 +274,8 @@ */ static herr_t H5B2__cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - H5B2_hdr_t *hdr, unsigned UNUSED * flags_ptr) + H5B2_hdr_t *hdr, unsigned UNUSED * flags_ptr, + H5SL_t UNUSED *collective_write_list) { H5WB_t *wb = NULL; /* Wrapped buffer for header data */ uint8_t hdr_buf[H5B2_HDR_BUF_SIZE]; /* Buffer for header */ @@ -621,7 +622,7 @@ *------------------------------------------------------------------------- */ static herr_t -H5B2__cache_internal_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B2_internal_t *internal, unsigned UNUSED * flags_ptr) +H5B2__cache_internal_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B2_internal_t *internal, unsigned UNUSED * flags_ptr, H5SL_t UNUSED *collective_write_list) { herr_t ret_value = SUCCEED; /* Return value */ @@ -943,7 +944,7 @@ *------------------------------------------------------------------------- */ static herr_t -H5B2__cache_leaf_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B2_leaf_t *leaf, unsigned UNUSED * flags_ptr) +H5B2__cache_leaf_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B2_leaf_t *leaf, unsigned UNUSED * flags_ptr, H5SL_t UNUSED *collective_write_list) { herr_t ret_value = SUCCEED; /* Return value */ Index: src/H5Bcache.c =================================================================== --- src/H5Bcache.c (revision 25532) +++ src/H5Bcache.c (working copy) @@ -38,6 +38,7 @@ #include "H5Bpkg.h" /* B-link trees */ #include "H5Eprivate.h" /* Error handling */ #include "H5MFprivate.h" /* File memory management */ +#include "H5MMprivate.h" /* Memory management */ /****************/ @@ -56,7 +57,7 @@ /* Metadata cache callbacks */ static H5B_t *H5B__load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5B__flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B_t *b, unsigned UNUSED * flags_ptr); +static herr_t H5B__flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B_t *b, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list); static herr_t H5B__dest(H5F_t *f, H5B_t *bt); static herr_t H5B__clear(H5F_t *f, H5B_t *b, hbool_t destroy); static herr_t H5B__compute_size(const H5F_t *f, const H5B_t *bt, size_t *size_ptr); @@ -206,9 +207,10 @@ *------------------------------------------------------------------------- */ static herr_t -H5B__flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B_t *bt, unsigned UNUSED * flags_ptr) +H5B__flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B_t *bt, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list) { H5B_shared_t *shared; /* Pointer to shared B-tree info */ + uint8_t *buf = NULL; /* Pointer to buffer */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC @@ -227,8 +229,11 @@ uint8_t *native; /* Pointer to native keys */ unsigned u; /* Local index variable */ - p = shared->page; + if(NULL == (buf = (uint8_t *)malloc(shared->sizeof_rnode))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate buffer") + p = buf; + /* magic number */ HDmemcpy(p, H5B_MAGIC, (size_t)H5_SIZEOF_MAGIC); p += 4; @@ -263,22 +268,46 @@ HGOTO_ERROR(H5E_BTREE, H5E_CANTENCODE, FAIL, "unable to encode B-tree key") } /* end if */ - /* - * Write the disk page. We always write the header, but we don't - * bother writing data for the child entries that don't exist or - * for the final unchanged children. - */ - if(H5F_block_write(f, H5FD_MEM_BTREE, addr, shared->sizeof_rnode, dxpl_id, shared->page) < 0) - HGOTO_ERROR(H5E_BTREE, H5E_CANTFLUSH, FAIL, "unable to save B-tree node to disk") +#ifdef H5_HAVE_PARALLEL + if(collective_write_list) { + H5AC_collective_write_t *item = NULL; - bt->cache_info.is_dirty = FALSE; + if(NULL == (item = (H5AC_collective_write_t *)H5MM_malloc(sizeof(H5AC_collective_write_t)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate skip list item") + item->length = shared->sizeof_rnode; + item->free_buf = TRUE; + item->buf = buf; + item->offset = addr; + if(H5SL_insert(collective_write_list, item, &item->offset) < 0) { + H5MM_free(item); + HGOTO_ERROR(H5E_HEAP, H5E_CANTINSERT, FAIL, "unable to insert skip list item") + } /* end if */ + } /* end if */ + else +#endif /* H5_HAVE_PARALLEL */ + { + /* + * Write the disk page. We always write the header, but we don't + * bother writing data for the child entries that don't exist or + * for the final unchanged children. + */ + if(H5F_block_write(f, H5FD_MEM_BTREE, addr, shared->sizeof_rnode, dxpl_id, buf) < 0) + HGOTO_ERROR(H5E_BTREE, H5E_CANTFLUSH, FAIL, "unable to save B-tree node to disk") + } /* end block/else */ + + bt->cache_info.is_dirty = FALSE; } /* end if */ - if(destroy) + if(destroy) { + HDassert(!collective_write_list); if(H5B__dest(f, bt) < 0) HGOTO_ERROR(H5E_BTREE, H5E_CANTFREE, FAIL, "unable to destroy B-tree node") + } /* end if */ done: + if(buf && !(collective_write_list && (ret_value == SUCCEED))) + buf = (uint8_t *)H5MM_xfree(buf); + FUNC_LEAVE_NOAPI(ret_value) } /* end H5B__flush() */ Index: src/H5C.c =================================================================== --- src/H5C.c (revision 25532) +++ src/H5C.c (working copy) @@ -142,7 +142,8 @@ haddr_t addr, unsigned flags, hbool_t * first_flush_ptr, - hbool_t del_entry_from_slist_on_destroy); + hbool_t del_entry_from_slist_on_destroy, + H5SL_t * collective_write_list); static herr_t H5C_flush_invalidate_cache(H5F_t * f, hid_t primary_dxpl_id, @@ -212,11 +213,12 @@ void *udata); static herr_t H5C_epoch_marker_flush(H5F_t *f, hid_t dxpl_id, hbool_t dest, haddr_t addr, void *thing, - unsigned *flags_ptr); + unsigned *flags_ptr, H5SL_t UNUSED *collective_write_list); static herr_t H5C_epoch_marker_dest(H5F_t *f, void *thing); static herr_t H5C_epoch_marker_clear(H5F_t *f, void *thing, hbool_t dest); static herr_t H5C_epoch_marker_notify(H5C_notify_action_t action, void *thing); static herr_t H5C_epoch_marker_size(const H5F_t *f, const void *thing, size_t *size_ptr); +static herr_t H5C_collective_write_free(void *_item, void *key, void *op_data); const H5C_class_t epoch_marker_class = { @@ -261,7 +263,8 @@ hbool_t UNUSED dest, haddr_t UNUSED addr, void UNUSED *thing, - unsigned UNUSED * flags_ptr) + unsigned UNUSED * flags_ptr, + H5SL_t UNUSED *collective_write_list) { herr_t ret_value = FAIL; /* Return value */ @@ -336,6 +339,172 @@ } +#ifdef H5_HAVE_PARALLEL +static herr_t +H5C_collective_write(H5F_t *f, + hid_t primary_dxpl_id, + hid_t secondary_dxpl_id, + H5SL_t *collective_write_list, + hbool_t *first_flush_ptr) +{ + hid_t dxpl_id; + H5P_genplist_t *plist = NULL; + H5FD_mpio_xfer_t xfer_mode = H5FD_MPIO_COLLECTIVE_MD; + H5FD_mpio_xfer_t old_xfer_mode = H5FD_MPIO_COLLECTIVE_MD; + H5SL_node_t *node; + H5C_collective_write_t *item; + int count; + void *base_buf; + int *length_array = NULL; + MPI_Aint *buf_array = NULL; + MPI_Aint *offset_array = NULL; + MPI_Datatype btype; + MPI_Datatype ftype; + hbool_t btype_created = FALSE; + hbool_t ftype_created = FALSE; + int mpi_code; + int i; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT + + count = (int)H5SL_count(collective_write_list); + + if(count > 0) { + /* Handle first_flush_ptr/dxpl */ + dxpl_id = *first_flush_ptr ? primary_dxpl_id : secondary_dxpl_id; + *first_flush_ptr = FALSE; + + if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list") + + old_xfer_mode = (H5FD_mpio_xfer_t)H5P_peek_unsigned(plist, H5D_XFER_IO_XFER_MODE_NAME); + + if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property") + + /* Allocate arrays */ + if(NULL == (length_array = (int *)H5MM_malloc((size_t)count * sizeof(int)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "memory allocation failed for collective write table length array") + if(NULL == (buf_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "memory allocation failed for collective buf table length array") + if(NULL == (offset_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "memory allocation failed for collective offset table length array") + + /* Fill arrays */ + node = H5SL_first(collective_write_list); + HDassert(node); + if(NULL == (item = (H5C_collective_write_t *)H5SL_item(node))) + HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item") + + length_array[0] = (int)item->length; + base_buf = item->buf; + buf_array[0] = (MPI_Aint)0; + offset_array[0] = (MPI_Aint)item->offset; + + node = H5SL_next(node); + i = 1; + while(node) { + if(NULL == (item = (H5C_collective_write_t *)H5SL_item(node))) + HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item") + + length_array[i] = (int)item->length; + buf_array[i] = (MPI_Aint)item->buf - (MPI_Aint)base_buf; + offset_array[i] = (MPI_Aint)item->offset; + + node = H5SL_next(node); + i++; + } /* end while */ + + /* Create memory mpi type */ + if(MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed(count, length_array, buf_array, MPI_BYTE, &btype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code) + btype_created = TRUE; + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&btype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) + + /* Create file mpi type */ + if(MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed(count, length_array, offset_array, MPI_BYTE, &ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code) + ftype_created = TRUE; + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) + + /* Pass buf type, file type to the file driver. */ + if(H5FD_mpi_setup_collective(dxpl_id, &btype, &ftype) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties") + + /* Write data */ + if(H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, (size_t)1, dxpl_id, base_buf) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to write entries collectively") + } /* end if */ + else { + MPI_Status mpi_stat; + MPI_File mpi_fh_p; + MPI_File mpi_fh; + + if(H5F_get_mpi_handle(f, (MPI_File **)&mpi_fh_p) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get mpi file handle") + mpi_fh = *(MPI_File*)mpi_fh_p; + + /* just to match up with the 1st MPI_File_set_view from H5FD_mpio_write() */ + if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + + /* just to match up with MPI_File_write_at_all from H5FD_mpio_write() */ + HDmemset(&mpi_stat, 0, sizeof(MPI_Status)); + if(MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(mpi_fh, 0, NULL, 0, MPI_BYTE, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code) + + /* just to match up with the 2nd MPI_File_set_view (reset) in H5FD_mpio_write() */ + if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + } /* end else */ + +done: + /* Free arrays */ + length_array = (int *)H5MM_xfree(length_array); + buf_array = (MPI_Aint *)H5MM_xfree(buf_array); + offset_array = (MPI_Aint *)H5MM_xfree(offset_array); + + /* Free MPI Types */ + if(btype_created && MPI_SUCCESS != (mpi_code = MPI_Type_free(&btype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + if(ftype_created && MPI_SUCCESS != (mpi_code = MPI_Type_free(&ftype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + + /* Reset dxpl */ + if(old_xfer_mode != (unsigned)H5FD_MPIO_COLLECTIVE_MD) { + HDassert(plist); + if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &old_xfer_mode) < 0) + HDONE_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property") + } /* end if */ + + FUNC_LEAVE_NOAPI(ret_value); +} /* end H5C_collective_write() */ +#endif /* H5_HAVE_PARALLEL */ + + +#ifdef H5_HAVE_PARALLEL +static herr_t +H5C_collective_write_free(void *_item, void UNUSED *key, void UNUSED *op_data) +{ + H5C_collective_write_t *item = (H5C_collective_write_t *)_item; + + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(item); + + if(item->free_buf) + item->buf = H5MM_xfree(item->buf); + /*!FIXME change to use free list for items */ + H5MM_free(item); + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* end H5C_double_collective_table() */ +#endif /* H5_HAVE_PARALLEL */ + + /*------------------------------------------------------------------------- * Function: H5C_apply_candidate_list * @@ -440,12 +609,10 @@ int entries_to_clear = 0; int entries_to_flush = 0; int entries_to_flush_or_clear_last = 0; - int entries_to_flush_collectively = 0; int entries_cleared = 0; int entries_flushed = 0; int entries_delayed = 0; int entries_flushed_or_cleared_last = 0; - int entries_flushed_collectively = 0; int entries_examined = 0; int initial_list_len; int * candidate_assignment_table = NULL; @@ -460,6 +627,7 @@ #if H5C_APPLY_CANDIDATE_LIST__DEBUG char tbl_buf[1024]; #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */ + H5SL_t *collective_write_list = NULL; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) @@ -549,6 +717,10 @@ HDfprintf(stdout, "%s:%d: marking entries.\n", FUNC, mpi_rank); #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */ + /* Create skip list of entries for collective write */ + if(NULL == (collective_write_list = H5SL_create(H5SL_TYPE_HADDR, NULL))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTCREATE, FAIL, "can't create skip list for entries") + for(i = 0; i < num_candidates; i++) { addr = candidates_list_ptr[i]; HDassert( H5F_addr_defined(addr) ); @@ -655,7 +827,8 @@ clear_ptr->addr, H5C__FLUSH_CLEAR_ONLY_FLAG, &first_flush, - TRUE) < 0) + TRUE, + NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't clear entry.") } /* end if */ @@ -679,7 +852,8 @@ flush_ptr->addr, H5C__NO_FLAGS_SET, &first_flush, - TRUE) < 0) + TRUE, + collective_write_list) < 0) HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't flush entry.") } /* end else-if */ @@ -739,11 +913,8 @@ intense rework of this function and potentially the function of candidate lists as a whole. */ - HDassert(entry_ptr->flush_me_collectively); entries_to_flush_or_clear_last++; - entries_to_flush_collectively++; HDassert(entries_to_flush_or_clear_last == 1); - HDassert(entries_to_flush_collectively == 1); /* Delay the entry. It will be flushed later. */ delayed_ptr = entry_ptr; @@ -772,7 +943,8 @@ clear_ptr->addr, H5C__FLUSH_CLEAR_ONLY_FLAG, &first_flush, - TRUE) < 0) + TRUE, + NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't clear entry.") } /* end else-if */ @@ -795,7 +967,8 @@ flush_ptr->addr, H5C__NO_FLAGS_SET, &first_flush, - TRUE) < 0) + TRUE, + collective_write_list) < 0) HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't flush entry.") } /* end else-if */ } /* end if */ @@ -831,26 +1004,47 @@ if (delayed_ptr->clear_on_unprotect) { entry_ptr->clear_on_unprotect = FALSE; entries_cleared++; + + if(H5C_flush_single_entry(f, + primary_dxpl_id, + secondary_dxpl_id, + delayed_ptr->type, + delayed_ptr->addr, + H5C__FLUSH_CLEAR_ONLY_FLAG, + &first_flush, + TRUE, + NULL) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, + "Can't flush entry.") } else if (delayed_ptr->flush_immediately) { entry_ptr->flush_immediately = FALSE; entries_flushed++; + + if(H5C_flush_single_entry(f, + primary_dxpl_id, + secondary_dxpl_id, + delayed_ptr->type, + delayed_ptr->addr, + H5C__NO_FLAGS_SET, + &first_flush, + TRUE, + collective_write_list) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, + "Can't flush entry.") } /* end if */ - if(H5C_flush_single_entry(f, - primary_dxpl_id, - secondary_dxpl_id, - delayed_ptr->type, - delayed_ptr->addr, - H5C__NO_FLAGS_SET, - &first_flush, - TRUE) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, - "Can't flush entry collectively.") - - entries_flushed_collectively++; entries_flushed_or_cleared_last++; } /* end if */ + /* Write collective list */ + if(H5C_collective_write(f, + primary_dxpl_id, + secondary_dxpl_id, + collective_write_list, + &first_flush) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, + "Can't write metadata collectively") + /* ====================================================================== * * Finished flushing everything. * * ====================================================================== */ @@ -858,17 +1052,18 @@ HDassert((entries_flushed == entries_to_flush)); HDassert((entries_cleared == entries_to_clear)); HDassert((entries_flushed_or_cleared_last == entries_to_flush_or_clear_last)); - HDassert((entries_flushed_collectively == entries_to_flush_collectively)); if((entries_flushed != entries_to_flush) || (entries_cleared != entries_to_clear) || - (entries_flushed_or_cleared_last != entries_to_flush_or_clear_last) || - (entries_flushed_collectively != entries_to_flush_collectively)) + (entries_flushed_or_cleared_last != entries_to_flush_or_clear_last)) HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "entry count mismatch.") done: if(candidate_assignment_table != NULL) candidate_assignment_table = (int *)H5MM_xfree((void *)candidate_assignment_table); + if(collective_write_list) + if(H5SL_destroy(collective_write_list, H5C_collective_write_free, NULL) < 0) + HDONE_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "failed to destroy skip list") FUNC_LEAVE_NOAPI(ret_value) } /* H5C_apply_candidate_list() */ @@ -1661,7 +1856,8 @@ entry_ptr->addr, H5C__FLUSH_INVALIDATE_FLAG | H5C__FLUSH_CLEAR_ONLY_FLAG, &first_flush, - TRUE); + TRUE, + NULL); if ( result < 0 ) { @@ -1962,7 +2158,8 @@ entry_ptr->addr, flags, &first_flush, - FALSE); + FALSE, + NULL); if ( status < 0 ) { /* This shouldn't happen -- if it does, we are toast @@ -1995,7 +2192,8 @@ entry_ptr->addr, flags, &first_flush, - FALSE); + FALSE, + NULL); if ( status < 0 ) { /* This shouldn't happen -- if it does, we are @@ -3125,7 +3323,8 @@ addr, H5C__FLUSH_CLEAR_ONLY_FLAG, &first_flush, - TRUE) < 0 ) { + TRUE, + NULL) < 0 ) { HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't clear entry.") } @@ -3184,7 +3383,8 @@ clear_ptr->addr, H5C__FLUSH_CLEAR_ONLY_FLAG, &first_flush, - TRUE) < 0 ) { + TRUE, + NULL) < 0 ) { HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't clear entry.") } @@ -3221,7 +3421,8 @@ clear_ptr->addr, H5C__FLUSH_CLEAR_ONLY_FLAG, &first_flush, - TRUE) < 0 ) { + TRUE, + NULL) < 0 ) { HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't clear entry.") } @@ -5810,7 +6011,8 @@ addr, flush_flags, &dummy_first_flush, - TRUE) < 0 ) { + TRUE, + NULL) < 0 ) { HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "Can't flush.") } @@ -5847,7 +6049,8 @@ addr, H5C__FLUSH_CLEAR_ONLY_FLAG, &dummy_first_flush, - TRUE) < 0 ) { + TRUE, + NULL) < 0 ) { HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "Can't clear.") } @@ -7168,7 +7371,8 @@ entry_ptr->addr, H5C__NO_FLAGS_SET, first_flush_ptr, - FALSE); + FALSE, + NULL); } else { bytes_evicted += entry_ptr->size; @@ -7180,7 +7384,8 @@ entry_ptr->addr, H5C__FLUSH_INVALIDATE_FLAG, first_flush_ptr, - TRUE); + TRUE, + NULL); } if ( result < 0 ) { @@ -7278,7 +7483,8 @@ entry_ptr->addr, H5C__FLUSH_INVALIDATE_FLAG, first_flush_ptr, - TRUE); + TRUE, + NULL); if ( result < 0 ) { @@ -8025,7 +8231,8 @@ entry_ptr->addr, H5C__NO_FLAGS_SET, &first_flush, - FALSE); + FALSE, + NULL); if ( status < 0 ) { /* This shouldn't happen -- if it does, we are toast @@ -8051,7 +8258,8 @@ entry_ptr->addr, (cooked_flags | H5C__FLUSH_INVALIDATE_FLAG), &first_flush, - TRUE); + TRUE, + NULL); if ( status < 0 ) { /* This shouldn't happen -- if it does, we are toast so @@ -8145,7 +8353,8 @@ entry_ptr->addr, (cooked_flags | H5C__FLUSH_INVALIDATE_FLAG), &first_flush, - TRUE); + TRUE, + NULL); if ( status < 0 ) { /* This shouldn't happen -- if it does, we are toast so @@ -8329,7 +8538,8 @@ haddr_t addr, unsigned flags, hbool_t * first_flush_ptr, - hbool_t del_entry_from_slist_on_destroy) + hbool_t del_entry_from_slist_on_destroy, + H5SL_t * collective_write_list) { H5C_t * cache_ptr = f->shared->cache; hbool_t destroy; @@ -8553,7 +8763,7 @@ status = (entry_ptr->type->flush)(f, primary_dxpl_id, destroy_entry, entry_ptr->addr, entry_ptr, - &flush_flags); + &flush_flags, collective_write_list); *first_flush_ptr = FALSE; } else { @@ -8560,7 +8770,8 @@ status = (entry_ptr->type->flush)(f, secondary_dxpl_id, destroy_entry, entry_ptr->addr, - entry_ptr, &flush_flags); + entry_ptr, &flush_flags, + collective_write_list); } if ( status < 0 ) { @@ -9004,7 +9215,8 @@ entry_ptr->addr, H5C__NO_FLAGS_SET, first_flush_ptr, - FALSE); + FALSE, + NULL); } else if ( (cache_ptr->index_size + space_needed) > cache_ptr->max_cache_size ) { @@ -9019,7 +9231,8 @@ entry_ptr->addr, H5C__FLUSH_INVALIDATE_FLAG, first_flush_ptr, - TRUE); + TRUE, + NULL); } else { /* We have enough space so don't flush clean entry. @@ -9179,7 +9392,8 @@ entry_ptr->addr, H5C__FLUSH_INVALIDATE_FLAG, first_flush_ptr, - TRUE); + TRUE, + NULL); if ( result < 0 ) { Index: src/H5Cprivate.h =================================================================== --- src/H5Cprivate.h (revision 25532) +++ src/H5Cprivate.h (working copy) @@ -35,6 +35,7 @@ /* Private headers needed by this header */ #include "H5private.h" /* Generic Functions */ #include "H5Fprivate.h" /* File access */ +#include "H5SLprivate.h" /* Skip Lists */ #define H5C_DO_SANITY_CHECKS 0 @@ -144,7 +145,8 @@ hbool_t dest, haddr_t addr, void *thing, - unsigned * flags_ptr); + unsigned * flags_ptr, + H5SL_t *collective_write_list); typedef herr_t (*H5C_dest_func_t)(H5F_t *f, void *thing); typedef herr_t (*H5C_clear_func_t)(H5F_t *f, @@ -994,7 +996,16 @@ } H5C_auto_size_ctl_t; +#ifdef H5_HAVE_PARALLEL +typedef struct H5C_collective_write_t { + size_t length; + hbool_t free_buf; + void *buf; + haddr_t offset; +} H5C_collective_write_t; +#endif /* H5_HAVE_PARALLEL */ + /* * Library prototypes. */ Index: src/H5EAcache.c =================================================================== --- src/H5EAcache.c (revision 25532) +++ src/H5EAcache.c (working copy) @@ -81,30 +81,30 @@ /* Metadata cache (H5AC) callbacks */ static H5EA_hdr_t *H5EA__cache_hdr_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5EA__cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5EA_hdr_t *hdr, unsigned * flags_ptr); +static herr_t H5EA__cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5EA_hdr_t *hdr, unsigned * flags_ptr, H5SL_t *collective_write_list); static herr_t H5EA__cache_hdr_clear(H5F_t *f, H5EA_hdr_t *hdr, hbool_t destroy); static herr_t H5EA__cache_hdr_size(const H5F_t *f, const H5EA_hdr_t *hdr, size_t *size_ptr); static herr_t H5EA__cache_hdr_dest(H5F_t *f, H5EA_hdr_t *hdr); static H5EA_iblock_t *H5EA__cache_iblock_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5EA__cache_iblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5EA_iblock_t *iblock, unsigned * flags_ptr); +static herr_t H5EA__cache_iblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5EA_iblock_t *iblock, unsigned * flags_ptr, H5SL_t *collective_write_list); static herr_t H5EA__cache_iblock_clear(H5F_t *f, H5EA_iblock_t *iblock, hbool_t destroy); static herr_t H5EA__cache_iblock_notify(H5AC_notify_action_t action, H5EA_iblock_t *iblock); static herr_t H5EA__cache_iblock_size(const H5F_t *f, const H5EA_iblock_t *iblock, size_t *size_ptr); static herr_t H5EA__cache_iblock_dest(H5F_t *f, H5EA_iblock_t *iblock); static H5EA_sblock_t *H5EA__cache_sblock_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5EA__cache_sblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5EA_sblock_t *sblock, unsigned * flags_ptr); +static herr_t H5EA__cache_sblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5EA_sblock_t *sblock, unsigned * flags_ptr, H5SL_t *collective_write_list); static herr_t H5EA__cache_sblock_clear(H5F_t *f, H5EA_sblock_t *sblock, hbool_t destroy); static herr_t H5EA__cache_sblock_size(const H5F_t *f, const H5EA_sblock_t *sblock, size_t *size_ptr); static herr_t H5EA__cache_sblock_notify(H5AC_notify_action_t action, H5EA_sblock_t *sblock); static herr_t H5EA__cache_sblock_dest(H5F_t *f, H5EA_sblock_t *sblock); static H5EA_dblock_t *H5EA__cache_dblock_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5EA__cache_dblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5EA_dblock_t *dblock, unsigned * flags_ptr); +static herr_t H5EA__cache_dblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5EA_dblock_t *dblock, unsigned * flags_ptr, H5SL_t *collective_write_list); static herr_t H5EA__cache_dblock_clear(H5F_t *f, H5EA_dblock_t *dblock, hbool_t destroy); static herr_t H5EA__cache_dblock_size(const H5F_t *f, const H5EA_dblock_t *dblock, size_t *size_ptr); static herr_t H5EA__cache_dblock_notify(H5AC_notify_action_t action, H5EA_dblock_t *dblock); static herr_t H5EA__cache_dblock_dest(H5F_t *f, H5EA_dblock_t *dblock); static H5EA_dblk_page_t *H5EA__cache_dblk_page_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5EA__cache_dblk_page_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5EA_dblk_page_t *dblk_page, unsigned * flags_ptr); +static herr_t H5EA__cache_dblk_page_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5EA_dblk_page_t *dblk_page, unsigned * flags_ptr, H5SL_t *collective_write_list); static herr_t H5EA__cache_dblk_page_clear(H5F_t *f, H5EA_dblk_page_t *dblk_page, hbool_t destroy); static herr_t H5EA__cache_dblk_page_size(const H5F_t *f, const H5EA_dblk_page_t *dblk_page, size_t *size_ptr); static herr_t H5EA__cache_dblk_page_notify(H5AC_notify_action_t action, H5EA_dblk_page_t *dblk_page); @@ -350,7 +350,8 @@ BEGIN_FUNC(STATIC, ERR, herr_t, SUCCEED, FAIL, H5EA__cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - H5EA_hdr_t *hdr, unsigned UNUSED * flags_ptr)) + H5EA_hdr_t *hdr, unsigned UNUSED * flags_ptr, + H5SL_t UNUSED *collective_write_list)) H5WB_t *wb = NULL; /* Wrapped buffer for header data */ uint8_t hdr_buf[H5EA_HDR_BUF_SIZE]; /* Buffer for header */ @@ -705,7 +706,8 @@ BEGIN_FUNC(STATIC, ERR, herr_t, SUCCEED, FAIL, H5EA__cache_iblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - H5EA_iblock_t *iblock, unsigned UNUSED * flags_ptr)) + H5EA_iblock_t *iblock, unsigned UNUSED * flags_ptr, + H5SL_t UNUSED *collective_write_list)) /* Local variables */ H5WB_t *wb = NULL; /* Wrapped buffer for serializing data */ @@ -1111,7 +1113,8 @@ BEGIN_FUNC(STATIC, ERR, herr_t, SUCCEED, FAIL, H5EA__cache_sblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - H5EA_sblock_t *sblock, unsigned UNUSED * flags_ptr)) + H5EA_sblock_t *sblock, unsigned UNUSED * flags_ptr, + H5SL_t UNUSED *collective_write_list)) /* Local variables */ H5WB_t *wb = NULL; /* Wrapped buffer for serializing data */ @@ -1505,7 +1508,8 @@ BEGIN_FUNC(STATIC, ERR, herr_t, SUCCEED, FAIL, H5EA__cache_dblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - H5EA_dblock_t *dblock, unsigned UNUSED * flags_ptr)) + H5EA_dblock_t *dblock, unsigned UNUSED * flags_ptr, + H5SL_t UNUSED *collective_write_list)) /* Local variables */ H5WB_t *wb = NULL; /* Wrapped buffer for serializing data */ @@ -1879,7 +1883,8 @@ BEGIN_FUNC(STATIC, ERR, herr_t, SUCCEED, FAIL, H5EA__cache_dblk_page_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - H5EA_dblk_page_t *dblk_page, unsigned UNUSED * flags_ptr)) + H5EA_dblk_page_t *dblk_page, unsigned UNUSED * flags_ptr, + H5SL_t UNUSED *collective_write_list)) /* Local variables */ H5WB_t *wb = NULL; /* Wrapped buffer for serializing data */ Index: src/H5FAcache.c =================================================================== --- src/H5FAcache.c (revision 25532) +++ src/H5FAcache.c (working copy) @@ -75,19 +75,19 @@ /* Metadata cache (H5AC) callbacks */ static H5FA_hdr_t *H5FA__cache_hdr_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5FA__cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FA_hdr_t *hdr, unsigned * flags_ptr); +static herr_t H5FA__cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FA_hdr_t *hdr, unsigned * flags_ptr, H5SL_t *collective_write_list); static herr_t H5FA__cache_hdr_clear(H5F_t *f, H5FA_hdr_t *hdr, hbool_t destroy); static herr_t H5FA__cache_hdr_size(const H5F_t *f, const H5FA_hdr_t *hdr, size_t *size_ptr); static herr_t H5FA__cache_hdr_dest(H5F_t *f, H5FA_hdr_t *hdr); static H5FA_dblock_t *H5FA__cache_dblock_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5FA__cache_dblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FA_dblock_t *dblock, unsigned * flags_ptr); +static herr_t H5FA__cache_dblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FA_dblock_t *dblock, unsigned * flags_ptr, H5SL_t *collective_write_list); static herr_t H5FA__cache_dblock_clear(H5F_t *f, H5FA_dblock_t *dblock, hbool_t destroy); static herr_t H5FA__cache_dblock_size(const H5F_t *f, const H5FA_dblock_t *dblock, size_t *size_ptr); static herr_t H5FA__cache_dblock_dest(H5F_t *f, H5FA_dblock_t *dblock); static H5FA_dblk_page_t *H5FA__cache_dblk_page_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5FA__cache_dblk_page_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FA_dblk_page_t *dblk_page, unsigned * flags_ptr); +static herr_t H5FA__cache_dblk_page_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FA_dblk_page_t *dblk_page, unsigned * flags_ptr, H5SL_t *collective_write_list); static herr_t H5FA__cache_dblk_page_clear(H5F_t *f, H5FA_dblk_page_t *dblk_page, hbool_t destroy); static herr_t H5FA__cache_dblk_page_size(const H5F_t *f, const H5FA_dblk_page_t *dblk_page, size_t *size_ptr); static herr_t H5FA__cache_dblk_page_dest(H5F_t *f, H5FA_dblk_page_t *dblk_page); @@ -299,7 +299,8 @@ BEGIN_FUNC(STATIC, ERR, herr_t, SUCCEED, FAIL, H5FA__cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - H5FA_hdr_t *hdr, unsigned UNUSED * flags_ptr)) + H5FA_hdr_t *hdr, unsigned UNUSED * flags_ptr, + H5SL_t UNUSED *collective_write_list)) H5WB_t *wb = NULL; /* Wrapped buffer for header data */ uint8_t hdr_buf[H5FA_HDR_BUF_SIZE]; /* Buffer for header */ @@ -630,7 +631,8 @@ BEGIN_FUNC(STATIC, ERR, herr_t, SUCCEED, FAIL, H5FA__cache_dblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - H5FA_dblock_t *dblock, unsigned UNUSED * flags_ptr)) + H5FA_dblock_t *dblock, unsigned UNUSED * flags_ptr, + H5SL_t UNUSED *collective_write_list)) /* Local variables */ H5WB_t *wb = NULL; /* Wrapped buffer for serializing data */ @@ -955,7 +957,8 @@ BEGIN_FUNC(STATIC, ERR, herr_t, SUCCEED, FAIL, H5FA__cache_dblk_page_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - H5FA_dblk_page_t *dblk_page, unsigned UNUSED * flags_ptr)) + H5FA_dblk_page_t *dblk_page, unsigned UNUSED * flags_ptr, + H5SL_t UNUSED *collective_write_list)) /* Local variables */ H5WB_t *wb = NULL; /* Wrapped buffer for serializing data */ Index: src/H5FDmpi.h =================================================================== --- src/H5FDmpi.h (revision 25532) +++ src/H5FDmpi.h (working copy) @@ -39,7 +39,8 @@ /* Type of I/O for data transfer properties */ typedef enum H5FD_mpio_xfer_t { H5FD_MPIO_INDEPENDENT = 0, /*zero is the default*/ - H5FD_MPIO_COLLECTIVE + H5FD_MPIO_COLLECTIVE, + H5FD_MPIO_COLLECTIVE_MD } H5FD_mpio_xfer_t; /* Type of chunked dataset I/O */ Index: src/H5FDmpio.c =================================================================== --- src/H5FDmpio.c (revision 25532) +++ src/H5FDmpio.c (working copy) @@ -1743,6 +1743,7 @@ int type_size; /* MPI datatype used for I/O's size */ int io_size; /* Actual number of bytes requested */ hbool_t use_view_this_time = FALSE; + H5FD_mpio_xfer_t xfer_mode; /* I/O tranfer mode */ H5P_genplist_t *plist = NULL; /* Property list pointer */ herr_t ret_value = SUCCEED; @@ -1774,16 +1775,14 @@ fprintf(stdout, "in H5FD_mpio_write mpi_off=%ld size_i=%d\n", (long)mpi_off, size_i); #endif - if(type == H5FD_MEM_DRAW) { - H5FD_mpio_xfer_t xfer_mode; /* I/O tranfer mode */ + /* Obtain the data transfer properties */ + if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") - /* Obtain the data transfer properties */ - if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") + /* Obtain the data transfer properties */ + xfer_mode = (H5FD_mpio_xfer_t)H5P_peek_unsigned(plist, H5D_XFER_IO_XFER_MODE_NAME); - /* Obtain the data transfer properties */ - xfer_mode = (H5FD_mpio_xfer_t)H5P_peek_unsigned(plist, H5D_XFER_IO_XFER_MODE_NAME); - + if((type == H5FD_MEM_DRAW) || (xfer_mode == H5FD_MPIO_COLLECTIVE_MD)) { /* * Set up for a fancy xfer using complex types, or single byte block. We * wouldn't need to rely on the use_view field if MPI semantics allowed @@ -1790,7 +1789,8 @@ * us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which * could mean "use MPI_BYTE" by convention). */ - if(xfer_mode == H5FD_MPIO_COLLECTIVE) { + if((xfer_mode == H5FD_MPIO_COLLECTIVE) + || (xfer_mode == H5FD_MPIO_COLLECTIVE_MD)) { MPI_Datatype file_type; /* Remember that views are used */ @@ -1815,17 +1815,6 @@ mpi_off = 0; } /* end if */ } /* end if */ - else { -#if 0 /* JRM -- 3/23/10 */ /* this is no longer always the case */ - /* Only one process can do the actual metadata write */ - if(file->mpi_rank != H5_PAR_META_WRITE) -#ifdef LATER - HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "can't write metadata from non-zero rank") -#else /* LATER */ - HGOTO_DONE(SUCCEED) /* skip the actual write */ -#endif /* LATER */ -#endif /* JRM */ - } /* end if */ /* Write the data. */ if(use_view_this_time) { @@ -1835,12 +1824,15 @@ if(H5FD_mpio_Debug[(int)'t']) fprintf(stdout, "H5FD_mpio_write: using MPIO collective mode\n"); #endif - /* Peek the collective_opt property to check whether the application wants to do IO individually. */ - HDassert(plist); - coll_opt_mode = (H5FD_mpio_collective_opt_t)H5P_peek_unsigned(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME); + if(xfer_mode != H5FD_MPIO_COLLECTIVE_MD) { + /* Peek the collective_opt property to check whether the application wants to do IO individually. */ + HDassert(plist); + coll_opt_mode = (H5FD_mpio_collective_opt_t)H5P_peek_unsigned(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME); + } /* end if */ /*OKAY: CAST DISCARDS CONST QUALIFIER*/ - if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) { + if((coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) + || (xfer_mode == H5FD_MPIO_COLLECTIVE_MD)) { #ifdef H5FDmpio_DEBUG if(H5FD_mpio_Debug[(int)'t']) fprintf(stdout, "H5FD_mpio_write: doing MPI collective IO\n"); Index: src/H5FScache.c =================================================================== --- src/H5FScache.c (revision 25532) +++ src/H5FScache.c (working copy) @@ -80,12 +80,12 @@ /* Metadata cache callbacks */ static H5FS_t *H5FS_cache_hdr_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5FS_cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FS_t *fspace, unsigned UNUSED * flags_ptr); +static herr_t H5FS_cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FS_t *fspace, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list); static herr_t H5FS_cache_hdr_dest(H5F_t *f, H5FS_t *fspace); static herr_t H5FS_cache_hdr_clear(H5F_t *f, H5FS_t *fspace, hbool_t destroy); static herr_t H5FS_cache_hdr_size(const H5F_t *f, const H5FS_t *fspace, size_t *size_ptr); static H5FS_sinfo_t *H5FS_cache_sinfo_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5FS_cache_sinfo_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FS_sinfo_t *sinfo, unsigned UNUSED * flags_ptr); +static herr_t H5FS_cache_sinfo_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FS_sinfo_t *sinfo, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list); static herr_t H5FS_cache_sinfo_dest(H5F_t *f, H5FS_sinfo_t *sinfo); static herr_t H5FS_cache_sinfo_clear(H5F_t *f, H5FS_sinfo_t *sinfo, hbool_t destroy); static herr_t H5FS_cache_sinfo_size(const H5F_t *f, const H5FS_sinfo_t *sinfo, size_t *size_ptr); @@ -281,7 +281,7 @@ *------------------------------------------------------------------------- */ static herr_t -H5FS_cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FS_t *fspace, unsigned UNUSED * flags_ptr) +H5FS_cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FS_t *fspace, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list) { H5WB_t *wb = NULL; /* Wrapped buffer for header data */ uint8_t hdr_buf[H5FS_HDR_BUF_SIZE]; /* Buffer for header */ @@ -321,7 +321,7 @@ } /* end if */ /* Write section info to file */ - if(H5FS_cache_sinfo_flush(f, dxpl_id, FALSE, fspace->sect_addr, fspace->sinfo, NULL) < 0) + if(H5FS_cache_sinfo_flush(f, dxpl_id, FALSE, fspace->sect_addr, fspace->sinfo, NULL, collective_write_list) < 0) HGOTO_ERROR(H5E_FSPACE, H5E_CANTFLUSH, FAIL, "unable to save free space section info to disk") } /* end if */ @@ -819,7 +819,7 @@ *------------------------------------------------------------------------- */ static herr_t -H5FS_cache_sinfo_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FS_sinfo_t *sinfo, unsigned UNUSED * flags_ptr) +H5FS_cache_sinfo_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5FS_sinfo_t *sinfo, unsigned UNUSED * flags_ptr, H5SL_t UNUSED *collective_write_list) { herr_t ret_value = SUCCEED; /* Return value */ Index: src/H5Fmpi.c =================================================================== --- src/H5Fmpi.c (revision 25532) +++ src/H5Fmpi.c (working copy) @@ -79,6 +79,39 @@ #ifdef H5_HAVE_PARALLEL /*------------------------------------------------------------------------- + * Function: H5F_get_mpi_handle + * + * Purpose: Retrieves MPI File handle. + * + * Return: Success: The size (positive) + * Failure: Negative + * + * Programmer: Jonathan Kim + * June 5, 2013 + * + * Modifications: + *------------------------------------------------------------------------- + */ +herr_t +H5F_get_mpi_handle(const H5F_t *f, MPI_File **f_handle) +{ + herr_t ret_value = SUCCEED; + hid_t fapl=-1; + + FUNC_ENTER_NOAPI(FAIL) + + assert(f && f->shared); + + /* Dispatch to driver */ + if ((ret_value=H5FD_get_vfd_handle(f->shared->lf, fapl, f_handle)) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get mpi file handle") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5F_get_mpi_handle() */ + + +/*------------------------------------------------------------------------- * Function: H5F_mpi_get_rank * * Purpose: Retrieves the rank of an MPI process. Index: src/H5Fprivate.h =================================================================== --- src/H5Fprivate.h (revision 25532) +++ src/H5Fprivate.h (working copy) @@ -675,6 +675,7 @@ /* Parallel I/O (i.e. MPI) related routines */ #ifdef H5_HAVE_PARALLEL +H5_DLL herr_t H5F_get_mpi_handle(const H5F_t *f, MPI_File **f_handle); H5_DLL int H5F_mpi_get_rank(const H5F_t *f); H5_DLL MPI_Comm H5F_mpi_get_comm(const H5F_t *f); H5_DLL int H5F_mpi_get_size(const H5F_t *f); Index: src/H5Fsuper_cache.c =================================================================== --- src/H5Fsuper_cache.c (revision 25532) +++ src/H5Fsuper_cache.c (working copy) @@ -60,7 +60,7 @@ /* Metadata cache (H5AC) callbacks */ static H5F_super_t *H5F_sblock_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5F_sblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5F_super_t *sblock); +static herr_t H5F_sblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5F_super_t *sblock, unsigned * flags_ptr, H5SL_t *collective_write_list); static herr_t H5F_sblock_dest(H5F_t *f, H5F_super_t * sblock); static herr_t H5F_sblock_clear(H5F_t *f, H5F_super_t *sblock, hbool_t destroy); static herr_t H5F_sblock_size(const H5F_t *f, const H5F_super_t *sblock, size_t *size_ptr); @@ -639,8 +639,10 @@ */ static herr_t H5F_sblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t UNUSED addr, - H5F_super_t *sblock) + H5F_super_t *sblock, unsigned UNUSED * flags_ptr, + H5SL_t *collective_write_list) { + uint8_t *buf = NULL; /* Superblock & driver info blockencoding buffer */ herr_t ret_value = SUCCEED; FUNC_ENTER_NOAPI_NOINIT @@ -661,12 +663,14 @@ if(sblock->cache_info.is_dirty) { H5P_genplist_t *dxpl; /* DXPL object */ - uint8_t buf[H5F_MAX_SUPERBLOCK_SIZE + H5F_MAX_DRVINFOBLOCK_SIZE]; /* Superblock & driver info blockencoding buffer */ uint8_t *p; /* Ptr into encoding buffer */ haddr_t rel_eoa; /* Relative EOA for file */ size_t superblock_size; /* Size of superblock, in bytes */ size_t driver_size; /* Size of driver info block (bytes)*/ + if(NULL == (buf = (uint8_t *)H5MM_malloc(H5F_MAX_SUPERBLOCK_SIZE + H5F_MAX_DRVINFOBLOCK_SIZE))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate buffer") + /* Encode the common portion of the file superblock for all versions */ p = buf; HDmemcpy(p, H5F_SIGNATURE, (size_t)H5F_SIGNATURE_LEN); @@ -787,17 +791,36 @@ H5_ASSIGN_OVERFLOW(superblock_size, (p - buf), ptrdiff_t, size_t); /* Double check we didn't overrun the block (unlikely) */ - HDassert(superblock_size <= sizeof(buf)); + HDassert(superblock_size <= (H5F_MAX_SUPERBLOCK_SIZE + H5F_MAX_DRVINFOBLOCK_SIZE)); /* Get the DXPL plist object for DXPL ID */ if(NULL == (dxpl = (H5P_genplist_t *)H5I_object(dxpl_id))) HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "can't get property list") - /* Write superblock */ - /* (always at relative address 0) */ - if(H5FD_write(f->shared->lf, dxpl, H5FD_MEM_SUPER, (haddr_t)0, superblock_size, buf) < 0) - HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to write superblock") +#ifdef H5_HAVE_PARALLEL + if(collective_write_list) { + H5AC_collective_write_t *item = NULL; + if(NULL == (item = (H5AC_collective_write_t *)H5MM_malloc(sizeof(H5AC_collective_write_t)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate skip list item") + item->length = superblock_size; + item->free_buf = TRUE; + item->buf = buf; + item->offset = (haddr_t)0; + if(H5SL_insert(collective_write_list, item, &item->offset) < 0) { + H5MM_free(item); + HGOTO_ERROR(H5E_HEAP, H5E_CANTINSERT, FAIL, "unable to insert skip list item") + } /* end if */ + } /* end if */ + else +#endif /* H5_HAVE_PARALLEL */ + { + /* Write superblock */ + /* (always at relative address 0) */ + if(H5FD_write(f->shared->lf, dxpl, H5FD_MEM_SUPER, (haddr_t)0, superblock_size, buf) < 0) + HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to write superblock") + } /* end block/else */ + /* Check for newer version of superblock format & superblock extension */ if(sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_2 && H5F_addr_defined(sblock->ext_addr)) { H5O_loc_t ext_loc; /* "Object location" for superblock extension */ @@ -845,6 +868,9 @@ HGOTO_ERROR(H5E_FSPACE, H5E_CLOSEERROR, FAIL, "can't close superblock") done: + if(buf && !(collective_write_list && (ret_value == SUCCEED))) + buf = (uint8_t *)H5MM_xfree(buf); + FUNC_LEAVE_NOAPI(ret_value) } /* end H5F_sblock_flush() */ Index: src/H5Gcache.c =================================================================== --- src/H5Gcache.c (revision 25532) +++ src/H5Gcache.c (working copy) @@ -38,6 +38,7 @@ #include "H5Eprivate.h" /* Error handling */ #include "H5Gpkg.h" /* Groups */ #include "H5MFprivate.h" /* File memory management */ +#include "H5MMprivate.h" /* Memory management */ #include "H5WBprivate.h" /* Wrapped Buffers */ @@ -66,7 +67,7 @@ /* Metadata cache (H5AC) callbacks */ static H5G_node_t *H5G_node_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); static herr_t H5G_node_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - H5G_node_t *sym, unsigned *flags_ptr); + H5G_node_t *sym, unsigned *flags_ptr, H5SL_t *collective_write_list); static herr_t H5G_node_dest(H5F_t *f, H5G_node_t *sym); static herr_t H5G_node_clear(H5F_t *f, H5G_node_t *sym, hbool_t destroy); static herr_t H5G_node_size(const H5F_t *f, const H5G_node_t *sym, size_t *size_ptr); @@ -213,10 +214,9 @@ *------------------------------------------------------------------------- */ static herr_t -H5G_node_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5G_node_t *sym, unsigned UNUSED * flags_ptr) +H5G_node_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5G_node_t *sym, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list) { - H5WB_t *wb = NULL; /* Wrapped buffer for node data */ - uint8_t node_buf[H5G_NODE_BUF_SIZE]; /* Buffer for node */ + uint8_t *node = NULL; /* Superblock & driver info blockencoding buffer */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI_NOINIT @@ -232,17 +232,11 @@ * Write the symbol node to disk. */ if(sym->cache_info.is_dirty) { - uint8_t *node; /* Pointer to node buffer */ uint8_t *p; /* Pointer into raw data buffer */ - /* Wrap the local buffer for serialized node info */ - if(NULL == (wb = H5WB_wrap(node_buf, sizeof(node_buf)))) - HGOTO_ERROR(H5E_SYM, H5E_CANTINIT, FAIL, "can't wrap buffer") + if(NULL == (node = (uint8_t *)H5MM_malloc(sym->node_size))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate buffer") - /* Get a pointer to a buffer that's large enough for node */ - if(NULL == (node = (uint8_t *)H5WB_actual(wb, sym->node_size))) - HGOTO_ERROR(H5E_SYM, H5E_NOSPACE, FAIL, "can't get actual buffer") - /* Get temporary pointer to serialized symbol table node */ p = node; @@ -264,10 +258,29 @@ HGOTO_ERROR(H5E_SYM, H5E_CANTENCODE, FAIL, "can't serialize") HDmemset(p, 0, sym->node_size - (size_t)(p - node)); - /* Write the serialized symbol table node. */ - if(H5F_block_write(f, H5FD_MEM_BTREE, addr, sym->node_size, dxpl_id, node) < 0) - HGOTO_ERROR(H5E_SYM, H5E_WRITEERROR, FAIL, "unable to write symbol table node to the file") +#ifdef H5_HAVE_PARALLEL + if(collective_write_list) { + H5AC_collective_write_t *item = NULL; + if(NULL == (item = (H5AC_collective_write_t *)H5MM_malloc(sizeof(H5AC_collective_write_t)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate skip list item") + item->length = sym->node_size; + item->free_buf = TRUE; + item->buf = node; + item->offset = addr; + if(H5SL_insert(collective_write_list, item, &item->offset) < 0) { + H5MM_free(item); + HGOTO_ERROR(H5E_HEAP, H5E_CANTINSERT, FAIL, "unable to insert skip list item") + } /* end if */ + } /* end if */ + else +#endif /* H5_HAVE_PARALLEL */ + { + /* Write the serialized symbol table node. */ + if(H5F_block_write(f, H5FD_MEM_BTREE, addr, sym->node_size, dxpl_id, node) < 0) + HGOTO_ERROR(H5E_SYM, H5E_WRITEERROR, FAIL, "unable to write symbol table node to the file") + } /* end block/else */ + /* Reset the node's dirty flag */ sym->cache_info.is_dirty = FALSE; } /* end if */ @@ -281,9 +294,8 @@ HGOTO_ERROR(H5E_SYM, H5E_CANTFREE, FAIL, "unable to destroy symbol table node") done: - /* Release resources */ - if(wb && H5WB_unwrap(wb) < 0) - HDONE_ERROR(H5E_SYM, H5E_CLOSEERROR, FAIL, "can't close wrapped buffer") + if(node && !(collective_write_list && (ret_value == SUCCEED))) + node = (uint8_t *)H5MM_xfree(node); FUNC_LEAVE_NOAPI(ret_value) } /* end H5G_node_flush() */ Index: src/H5HFcache.c =================================================================== --- src/H5HFcache.c (revision 25532) +++ src/H5HFcache.c (working copy) @@ -80,18 +80,18 @@ /* Metadata cache (H5AC) callbacks */ static H5HF_hdr_t *H5HF_cache_hdr_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5HF_cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_hdr_t *hdr, unsigned UNUSED * flags_ptr); +static herr_t H5HF_cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_hdr_t *hdr, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list); static herr_t H5HF_cache_hdr_dest(H5F_t *f, H5HF_hdr_t *hdr); static herr_t H5HF_cache_hdr_clear(H5F_t *f, H5HF_hdr_t *hdr, hbool_t destroy); static herr_t H5HF_cache_hdr_size(const H5F_t *f, const H5HF_hdr_t *hdr, size_t *size_ptr); static H5HF_indirect_t *H5HF_cache_iblock_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5HF_cache_iblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_indirect_t *iblock, unsigned UNUSED * flags_ptr); +static herr_t H5HF_cache_iblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_indirect_t *iblock, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list); static herr_t H5HF_cache_iblock_dest(H5F_t *f, H5HF_indirect_t *iblock); static herr_t H5HF_cache_iblock_clear(H5F_t *f, H5HF_indirect_t *iblock, hbool_t destroy); static herr_t H5HF_cache_iblock_notify(H5C_notify_action_t action, H5HF_indirect_t *iblock); static herr_t H5HF_cache_iblock_size(const H5F_t *f, const H5HF_indirect_t *iblock, size_t *size_ptr); static H5HF_direct_t *H5HF_cache_dblock_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5HF_cache_dblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_direct_t *dblock, unsigned UNUSED * flags_ptr); +static herr_t H5HF_cache_dblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_direct_t *dblock, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list); static herr_t H5HF_cache_dblock_dest(H5F_t *f, H5HF_direct_t *dblock); static herr_t H5HF_cache_dblock_clear(H5F_t *f, H5HF_direct_t *dblock, hbool_t destroy); static herr_t H5HF_cache_dblock_notify(H5C_notify_action_t action, H5HF_direct_t *dblock); @@ -478,7 +478,7 @@ *------------------------------------------------------------------------- */ static herr_t -H5HF_cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_hdr_t *hdr, unsigned UNUSED * flags_ptr) +H5HF_cache_hdr_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_hdr_t *hdr, unsigned UNUSED * flags_ptr, H5SL_t UNUSED *collective_write_list) { H5WB_t *wb = NULL; /* Wrapped buffer for header data */ uint8_t hdr_buf[H5HF_HDR_BUF_SIZE]; /* Buffer for header */ @@ -961,7 +961,7 @@ *------------------------------------------------------------------------- */ static herr_t -H5HF_cache_iblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_indirect_t *iblock, unsigned UNUSED * flags_ptr) +H5HF_cache_iblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_indirect_t *iblock, unsigned UNUSED * flags_ptr, H5SL_t UNUSED *collective_write_list) { H5WB_t *wb = NULL; /* Wrapped buffer for indirect block data */ uint8_t iblock_buf[H5HF_IBLOCK_BUF_SIZE]; /* Buffer for indirect block */ @@ -1599,7 +1599,7 @@ *------------------------------------------------------------------------- */ static herr_t -H5HF_cache_dblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_direct_t *dblock, unsigned UNUSED * flags_ptr) +H5HF_cache_dblock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HF_direct_t *dblock, unsigned UNUSED * flags_ptr, H5SL_t UNUSED *collective_write_list) { herr_t ret_value = SUCCEED; /* Return value */ Index: src/H5HGcache.c =================================================================== --- src/H5HGcache.c (revision 25532) +++ src/H5HGcache.c (working copy) @@ -64,7 +64,8 @@ /* Metadata cache callbacks */ static H5HG_heap_t *H5HG_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); static herr_t H5HG_flush(H5F_t *f, hid_t dxpl_id, hbool_t dest, haddr_t addr, - H5HG_heap_t *heap, unsigned UNUSED * flags_ptr); + H5HG_heap_t *heap, unsigned UNUSED * flags_ptr, + H5SL_t *collective_write_list); static herr_t H5HG_dest(H5F_t *f, H5HG_heap_t *heap); static herr_t H5HG_clear(H5F_t *f, H5HG_heap_t *heap, hbool_t destroy); static herr_t H5HG_size(const H5F_t *f, const H5HG_heap_t *heap, size_t *size_ptr); @@ -280,7 +281,7 @@ *------------------------------------------------------------------------- */ static herr_t -H5HG_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HG_heap_t *heap, unsigned UNUSED * flags_ptr) +H5HG_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HG_heap_t *heap, unsigned UNUSED * flags_ptr, H5SL_t UNUSED *collective_write_list) { herr_t ret_value = SUCCEED; /* Return value */ Index: src/H5HLcache.c =================================================================== --- src/H5HLcache.c (revision 25532) +++ src/H5HLcache.c (working copy) @@ -38,6 +38,7 @@ #include "H5Eprivate.h" /* Error handling */ #include "H5HLpkg.h" /* Local Heaps */ #include "H5MFprivate.h" /* File memory management */ +#include "H5MMprivate.h" /* Memory management */ #include "H5WBprivate.h" /* Wrapped Buffers */ @@ -71,13 +72,13 @@ /* Metadata cache callbacks */ static void *H5HL_prefix_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); static herr_t H5HL_prefix_flush(H5F_t *f, hid_t dxpl_id, hbool_t dest, haddr_t addr, - void *thing, unsigned *flags_ptr); + void *thing, unsigned *flags_ptr, H5SL_t *collective_write_list); static herr_t H5HL_prefix_dest(H5F_t *f, void *thing); static herr_t H5HL_prefix_clear(H5F_t *f, void *thing, hbool_t destroy); static herr_t H5HL_prefix_size(const H5F_t *f, const void *thing, size_t *size_ptr); static void *H5HL_datablock_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); static herr_t H5HL_datablock_flush(H5F_t *f, hid_t dxpl_id, hbool_t dest, haddr_t addr, - void *thing, unsigned *flags_ptr); + void *thing, unsigned *flags_ptr, H5SL_t *collective_write_list); static herr_t H5HL_datablock_dest(H5F_t *f, void *thing); static herr_t H5HL_datablock_clear(H5F_t *f, void *thing, hbool_t destroy); static herr_t H5HL_datablock_size(const H5F_t *f, const void *thing, size_t *size_ptr); @@ -391,11 +392,10 @@ */ static herr_t H5HL_prefix_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - void *thing, unsigned UNUSED *flags_ptr) + void *thing, unsigned UNUSED *flags_ptr, H5SL_t *collective_write_list) { + uint8_t *buf = NULL; /* Pointer to heap buffer */ H5HL_prfx_t *prfx = (H5HL_prfx_t *)thing; /* Local heap prefix to flush */ - H5WB_t *wb = NULL; /* Wrapped buffer for heap data */ - uint8_t heap_buf[H5HL_SPEC_READ_SIZE]; /* Buffer for heap */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI_NOINIT @@ -407,22 +407,16 @@ if(prfx->cache_info.is_dirty) { H5HL_t *heap = prfx->heap; /* Pointer to the local heap */ - uint8_t *buf; /* Pointer to heap buffer */ size_t buf_size; /* Size of buffer for encoding & writing heap info */ uint8_t *p; /* Pointer into raw data buffer */ - /* Wrap the local buffer for serialized heap info */ - if(NULL == (wb = H5WB_wrap(heap_buf, sizeof(heap_buf)))) - HGOTO_ERROR(H5E_HEAP, H5E_CANTINIT, FAIL, "can't wrap buffer") - /* Compute the size of the buffer to encode & write */ buf_size = heap->prfx_size; if(heap->single_cache_obj) buf_size += heap->dblk_size; - /* Get a pointer to a buffer that's large enough for serialized heap */ - if(NULL == (buf = (uint8_t *)H5WB_actual(wb, buf_size))) - HGOTO_ERROR(H5E_HEAP, H5E_NOSPACE, FAIL, "can't get actual buffer") + if(NULL == (buf = (uint8_t *)malloc(buf_size))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate buffer") /* Update the free block value from the free list */ heap->free_block = heap->freelist ? heap->freelist->offset : H5HL_FREE_NULL; @@ -459,10 +453,29 @@ HDmemcpy(p, heap->dblk_image, heap->dblk_size); } /* end if */ - /* Write the prefix [and possibly the data block] to the file */ - if(H5F_block_write(f, H5FD_MEM_LHEAP, addr, buf_size, dxpl_id, buf) < 0) - HGOTO_ERROR(H5E_HEAP, H5E_WRITEERROR, FAIL, "unable to write heap header and data to file") +#ifdef H5_HAVE_PARALLEL + if(collective_write_list) { + H5AC_collective_write_t *item = NULL; + if(NULL == (item = (H5AC_collective_write_t *)H5MM_malloc(sizeof(H5AC_collective_write_t)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate skip list item") + item->length = buf_size; + item->free_buf = TRUE; + item->buf = buf; + item->offset = addr; + if(H5SL_insert(collective_write_list, item, &item->offset) < 0) { + H5MM_free(item); + HGOTO_ERROR(H5E_HEAP, H5E_CANTINSERT, FAIL, "unable to insert skip list item") + } /* end if */ + } /* end if */ + else +#endif /* H5_HAVE_PARALLEL */ + { + /* Write the prefix [and possibly the data block] to the file */ + if(H5F_block_write(f, H5FD_MEM_LHEAP, addr, buf_size, dxpl_id, buf) < 0) + HGOTO_ERROR(H5E_HEAP, H5E_WRITEERROR, FAIL, "unable to write heap header and data to file") + } /* end block/else */ + prfx->cache_info.is_dirty = FALSE; } /* end if */ @@ -472,9 +485,8 @@ HGOTO_ERROR(H5E_HEAP, H5E_CANTFREE, FAIL, "unable to destroy local heap prefix") done: - /* Release resources */ - if(wb && H5WB_unwrap(wb) < 0) - HDONE_ERROR(H5E_HEAP, H5E_CLOSEERROR, FAIL, "can't close wrapped buffer") + if(buf && !(collective_write_list && (ret_value == SUCCEED))) + buf = (uint8_t *)H5MM_xfree(buf); FUNC_LEAVE_NOAPI(ret_value) } /* end H5HL_prefix_flush() */ @@ -691,7 +703,7 @@ */ static herr_t H5HL_datablock_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - void *_thing, unsigned UNUSED * flags_ptr) + void *_thing, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list) { H5HL_dblk_t *dblk = (H5HL_dblk_t *)_thing; /* Pointer to the local heap data block */ herr_t ret_value = SUCCEED; /* Return value */ @@ -714,17 +726,38 @@ /* Serialize the free list into the heap data's image */ H5HL_fl_serialize(heap); - /* Write the data block to the file */ - if(H5F_block_write(f, H5FD_MEM_LHEAP, heap->dblk_addr, heap->dblk_size, dxpl_id, heap->dblk_image) < 0) - HGOTO_ERROR(H5E_HEAP, H5E_WRITEERROR, FAIL, "unable to write heap data block to file") +#ifdef H5_HAVE_PARALLEL + if(collective_write_list) { + H5AC_collective_write_t *item = NULL; + if(NULL == (item = (H5AC_collective_write_t *)H5MM_malloc(sizeof(H5AC_collective_write_t)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate skip list item") + item->length = heap->dblk_size; + item->free_buf = FALSE; + item->buf = heap->dblk_image; + item->offset = heap->dblk_addr; + if(H5SL_insert(collective_write_list, item, &item->offset) < 0) { + H5MM_free(item); + HGOTO_ERROR(H5E_HEAP, H5E_CANTINSERT, FAIL, "unable to insert skip list item") + } /* end if */ + } /* end if */ + else +#endif /* H5_HAVE_PARALLEL */ + { + /* Write the data block to the file */ + if(H5F_block_write(f, H5FD_MEM_LHEAP, heap->dblk_addr, heap->dblk_size, dxpl_id, heap->dblk_image) < 0) + HGOTO_ERROR(H5E_HEAP, H5E_WRITEERROR, FAIL, "unable to write heap data block to file") + } /* end block/else */ + dblk->cache_info.is_dirty = FALSE; } /* end if */ /* Should we destroy the memory version? */ - if(destroy) + if(destroy) { + HDassert(!collective_write_list); if(H5HL_datablock_dest(f, dblk) < 0) HGOTO_ERROR(H5E_HEAP, H5E_CANTFREE, FAIL, "unable to destroy local heap data block") + } /* end if */ done: FUNC_LEAVE_NOAPI(ret_value) Index: src/H5Ocache.c =================================================================== --- src/H5Ocache.c (revision 25532) +++ src/H5Ocache.c (working copy) @@ -38,6 +38,7 @@ #include "H5Eprivate.h" /* Error handling */ #include "H5FLprivate.h" /* Free lists */ #include "H5MFprivate.h" /* File memory management */ +#include "H5MMprivate.h" /* Memory management */ #include "H5Opkg.h" /* Object headers */ #include "H5WBprivate.h" /* Wrapped Buffers */ @@ -69,13 +70,13 @@ /* Metadata cache callbacks */ static H5O_t *H5O_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5O_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5O_t *oh, unsigned UNUSED * flags_ptr); +static herr_t H5O_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5O_t *oh, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list); static herr_t H5O_dest(H5F_t *f, H5O_t *oh); static herr_t H5O_clear(H5F_t *f, H5O_t *oh, hbool_t destroy); static herr_t H5O_size(const H5F_t *f, const H5O_t *oh, size_t *size_ptr); static H5O_chunk_proxy_t *H5O_cache_chk_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5O_cache_chk_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5O_chunk_proxy_t *chk_proxy, unsigned UNUSED * flags_ptr); +static herr_t H5O_cache_chk_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5O_chunk_proxy_t *chk_proxy, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list); static herr_t H5O_cache_chk_dest(H5F_t *f, H5O_chunk_proxy_t *chk_proxy); static herr_t H5O_cache_chk_clear(H5F_t *f, H5O_chunk_proxy_t *chk_proxy, hbool_t destroy); static herr_t H5O_cache_chk_size(const H5F_t *f, const H5O_chunk_proxy_t *chk_proxy, size_t *size_ptr); @@ -370,7 +371,7 @@ *------------------------------------------------------------------------- */ static herr_t -H5O_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t UNUSED addr, H5O_t *oh, unsigned UNUSED * flags_ptr) +H5O_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t UNUSED addr, H5O_t *oh, unsigned UNUSED * flags_ptr, H5SL_t *collective_write_list) { herr_t ret_value = SUCCEED; /* Return value */ @@ -483,19 +484,40 @@ if(H5O_chunk_serialize(f, oh, (unsigned)0) < 0) HGOTO_ERROR(H5E_OHDR, H5E_CANTSERIALIZE, FAIL, "unable to serialize first object header chunk") - /* Write the chunk out */ - HDassert(H5F_addr_defined(oh->chunk[0].addr)); - if(H5F_block_write(f, H5FD_MEM_OHDR, oh->chunk[0].addr, oh->chunk[0].size, dxpl_id, oh->chunk[0].image) < 0) - HGOTO_ERROR(H5E_OHDR, H5E_WRITEERROR, FAIL, "unable to write object header chunk to disk") +#ifdef H5_HAVE_PARALLEL + if(collective_write_list) { + H5AC_collective_write_t *item = NULL; + if(NULL == (item = (H5AC_collective_write_t *)H5MM_malloc(sizeof(H5AC_collective_write_t)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate skip list item") + item->length = oh->chunk[0].size; + item->free_buf = FALSE; + item->buf = oh->chunk[0].image; + item->offset = oh->chunk[0].addr; + if(H5SL_insert(collective_write_list, item, &item->offset) < 0) { + H5MM_free(item); + HGOTO_ERROR(H5E_HEAP, H5E_CANTINSERT, FAIL, "unable to insert skip list item") + } /* end if */ + } /* end if */ + else +#endif /* H5_HAVE_PARALLEL */ + { + /* Write the chunk out */ + HDassert(H5F_addr_defined(oh->chunk[0].addr)); + if(H5F_block_write(f, H5FD_MEM_OHDR, oh->chunk[0].addr, oh->chunk[0].size, dxpl_id, oh->chunk[0].image) < 0) + HGOTO_ERROR(H5E_OHDR, H5E_WRITEERROR, FAIL, "unable to write object header chunk to disk") + } /* end block/else */ + /* Mark object header as clean now */ oh->cache_info.is_dirty = FALSE; } /* end if */ /* Destroy the object header, if requested */ - if(destroy) + if(destroy) { + HDassert(!collective_write_list); if(H5O_dest(f, oh) < 0) HGOTO_ERROR(H5E_OHDR, H5E_CANTFREE, FAIL, "unable to destroy object header data") + } /* end if */ done: FUNC_LEAVE_NOAPI(ret_value) @@ -779,7 +801,8 @@ */ static herr_t H5O_cache_chk_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, - H5O_chunk_proxy_t *chk_proxy, unsigned UNUSED * flags_ptr) + H5O_chunk_proxy_t *chk_proxy, unsigned UNUSED * flags_ptr, + H5SL_t *collective_write_list) { herr_t ret_value = SUCCEED; /* Return value */ @@ -791,20 +814,41 @@ if(H5O_chunk_serialize(f, chk_proxy->oh, chk_proxy->chunkno) < 0) HGOTO_ERROR(H5E_OHDR, H5E_CANTSERIALIZE, FAIL, "unable to serialize object header continuation chunk") - /* Write the chunk out */ - HDassert(H5F_addr_defined(chk_proxy->oh->chunk[chk_proxy->chunkno].addr)); - HDassert(H5F_addr_eq(addr, chk_proxy->oh->chunk[chk_proxy->chunkno].addr)); - if(H5F_block_write(f, H5FD_MEM_OHDR, addr, chk_proxy->oh->chunk[chk_proxy->chunkno].size, dxpl_id, chk_proxy->oh->chunk[chk_proxy->chunkno].image) < 0) - HGOTO_ERROR(H5E_OHDR, H5E_WRITEERROR, FAIL, "unable to write object header continuation chunk to disk") +#ifdef H5_HAVE_PARALLEL + if(collective_write_list) { + H5AC_collective_write_t *item = NULL; + if(NULL == (item = (H5AC_collective_write_t *)H5MM_malloc(sizeof(H5AC_collective_write_t)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate skip list item") + item->length = chk_proxy->oh->chunk[chk_proxy->chunkno].size; + item->free_buf = FALSE; + item->buf = chk_proxy->oh->chunk[chk_proxy->chunkno].image; + item->offset = addr; + if(H5SL_insert(collective_write_list, item, &item->offset) < 0) { + H5MM_free(item); + HGOTO_ERROR(H5E_HEAP, H5E_CANTINSERT, FAIL, "unable to insert skip list item") + } /* end if */ + } /* end if */ + else +#endif /* H5_HAVE_PARALLEL */ + { + /* Write the chunk out */ + HDassert(H5F_addr_defined(chk_proxy->oh->chunk[chk_proxy->chunkno].addr)); + HDassert(H5F_addr_eq(addr, chk_proxy->oh->chunk[chk_proxy->chunkno].addr)); + if(H5F_block_write(f, H5FD_MEM_OHDR, addr, chk_proxy->oh->chunk[chk_proxy->chunkno].size, dxpl_id, chk_proxy->oh->chunk[chk_proxy->chunkno].image) < 0) + HGOTO_ERROR(H5E_OHDR, H5E_WRITEERROR, FAIL, "unable to write object header continuation chunk to disk") + } /* end block/else */ + /* Mark object header as clean now */ chk_proxy->cache_info.is_dirty = FALSE; } /* end if */ /* Destroy the object header, if requested */ - if(destroy) + if(destroy) { + HDassert(!collective_write_list); if(H5O_cache_chk_dest(f, chk_proxy) < 0) HGOTO_ERROR(H5E_OHDR, H5E_CANTFREE, FAIL, "unable to destroy object header continuation chunk data") + } /* end if */ done: FUNC_LEAVE_NOAPI(ret_value) Index: src/H5SMcache.c =================================================================== --- src/H5SMcache.c (revision 25532) +++ src/H5SMcache.c (working copy) @@ -54,12 +54,12 @@ /* Metadata cache (H5AC) callbacks */ static H5SM_master_table_t *H5SM_table_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5SM_table_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5SM_master_table_t *table); +static herr_t H5SM_table_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5SM_master_table_t *table, H5SL_t *collective_write_list); static herr_t H5SM_table_dest(H5F_t *f, H5SM_master_table_t* table); static herr_t H5SM_table_clear(H5F_t *f, H5SM_master_table_t *table, hbool_t destroy); static herr_t H5SM_table_size(const H5F_t *f, const H5SM_master_table_t *table, size_t *size_ptr); static H5SM_list_t *H5SM_list_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, void *udata); -static herr_t H5SM_list_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5SM_list_t *list); +static herr_t H5SM_list_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5SM_list_t *list, H5SL_t *collective_write_list); static herr_t H5SM_list_dest(H5F_t *f, H5SM_list_t* list); static herr_t H5SM_list_clear(H5F_t *f, H5SM_list_t *list, hbool_t destroy); static herr_t H5SM_list_size(const H5F_t *f, const H5SM_list_t UNUSED *list, size_t *size_ptr); @@ -250,7 +250,7 @@ *------------------------------------------------------------------------- */ static herr_t -H5SM_table_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5SM_master_table_t *table) +H5SM_table_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5SM_master_table_t *table, H5SL_t UNUSED *collective_write_list) { H5WB_t *wb = NULL; /* Wrapped buffer for table data */ uint8_t tbl_buf[H5SM_TBL_BUF_SIZE]; /* Buffer for table */ @@ -559,7 +559,7 @@ *------------------------------------------------------------------------- */ static herr_t -H5SM_list_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5SM_list_t *list) +H5SM_list_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5SM_list_t *list, H5SL_t UNUSED *collective_write_list) { H5WB_t *wb = NULL; /* Wrapped buffer for list index data */ uint8_t lst_buf[H5SM_LST_BUF_SIZE]; /* Buffer for list index */