t[ 1 ]); // Make sure all threads have their block_begin result written out. # pragma omp barrier iterator_pair block_begin = block_begins[ iam ]; // Begin working for the first block, while the others except // the last start to count. if (iam == 0) { // The first thread can copy already. lengths[ iam ] = op.invoke(block_begin.first, block_end.first, block_begin.second, block_end.second, result) - result; } else { lengths[ iam ] = op.count(block_begin.first, block_end.first, block_begin.second, block_end.second); } // Make sure everyone wrote their lengths. # pragma omp barrier OutputIterator r = result; if (iam == 0) { // Do the last block. for (int i = 0; i < num_threads; ++i) %%% r += lengths[i]; block_begin = block_begins[num_threads]; // Return the result iterator of the last block. return_value = op.invoke( block_begin.first, end1, block_begin.second, end2, r); } else { for (int i = 0; i < iam; ++i) r += lengths[ i ]; // Reset begins for copy pass. op.invoke(block_begin.first, block_end.first, block_begin.second, block_end.second, r); } } return return_value; } template inline OutputIterator parallel_set_union(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result, Comparator comp) { return parallel_set_operation(begin1, end1, begin2, end2, result, union_func< InputIterator, OutputIterator, Comparator>(comp)); } template inline OutputIterator parallel_set_intersection(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result, Comparator comp) { return parallel_set_operation(begin1, end1, begin2, end2, result, intersection_func(comp)); } template inline OutputIterator set_intersection(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result) { typedef std::iterator_traits traits_type; typedef typename traits_type::value_type value_type; return set_intersection(begin1, end1, begin2, end2, result, std::less()); } template inline OutputIterator parallel_set_difference(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result, Comparator comp) { return parallel_set_operation(begin1, end1, begin2, end2, result, difference_func(comp)); } template inline OutputIterator parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result, Comparator comp) { return parallel_set_operation(begin1, end1, begin2, end2, result, symmetric_difference_func (comp)); } } #endif // _GLIBCXX_SET_ALGORITHM_ // -*- C++ -*- // Copyright (C) 2007, 2008 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/random_shuffle.h * @brief Parallel implementation of std::random_shuffle(). * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Johannes Singler. #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1 #include #include #include #include namespace __gnu_parallel { /** @brief Type to hold the index of a bin. * * Since many variables of this type are allocated, it should be * chosen as small as possible. */ typedef unsigned short bin_index; /** @brief Data known to every thread participating in __gnu_parallel::parallel_random_shuffle(). */ template struct DRandomShufflingGlobalData { typedef std::iterator_traits traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; /** @brief Begin iterator of the source. */ RandomAccessIterator& source; /** @brief Temporary arrays for each thread. */ value_type** temporaries; /** @brief Two-dimensional array to hold the thread-bin distribution. * * Dimensions (num_threads + 1) x (num_bins + 1). */ difference_type** dist; /** @brief Start indexes of the threads' chunks. */ difference_type* starts; /** @brief Number of the thread that will further process the corresponding bin. */ thread_index_t* bin_proc; /** @brief Number of bins to distribute to. */ int num_bins; /** @brief Number of bits needed to address the bins. */ int num_bits; /** @brief Constructor. */ DRandomShufflingGlobalData(RandomAccessIterator& _source) : source(_source) { } }; /** @brief Local data for a thread participating in __gnu_parallel::parallel_random_shuffle(). */ template struct DRSSorterPU { /** @brief Number of threads participating in total. */ int num_threads; /** @brief Begin index for bins taken care of by this thread. */ bin_index bins_begin; /** @brief End index for bins taken care of by this thread. */ bin_index bins_end; /** @brief Random seed for this thread. */ uint32 seed; /** @brief Pointer to global data. */ DRandomShufflingGlobalData* sd; }; /** @brief Generate a random number in @c [0,2^logp). * @param logp Logarithm (basis 2) of the upper range bound. * @param rng Random number generator to use. */ template inline int random_number_pow2(int logp, RandomNumberGenerator& rng) { return rng.genrand_bits(logp); } /** @brief Random shuffle code executed by each thread. * @param pus Array of thread-local data records. */ template void parallel_random_shuffle_drs_pu(DRSSorterPU* pus) { typedef std::iterator_traits traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; thread_index_t iam = omp_get_thread_num(); DRSSorterPU* d = &pus[iam]; DRandomShufflingGlobalData* sd = d->sd; // Indexing: dist[bin][processor] difference_type length = sd->starts[iam + 1] - sd->starts[iam]; bin_index* oracles = new bin_index[length]; difference_type* dist = new difference_type[sd->num_bins + 1]; bin_index* bin_proc = new bin_index[sd->num_bins]; value_type** temporaries = new value_type*[d->num_threads]; // Compute oracles and count appearances. for (bin_index b = 0; b < sd->num_bins + 1; ++b) dist[b] = 0; int num_bits = sd->num_bits; random_number rng(d->seed); // First main loop. for (difference_type i = 0; i < length; ++i) { bin_index oracle = random_number_pow2(num_bits, rng); oracles[i] = oracle; // To allow prefix (partial) sum. ++(dist[oracle + 1]); } for (bin_index b = 0; b < sd->num_bins + 1; ++b) sd->dist[b][iam + 1] = dist[b]; # pragma omp barrier # pragma omp single { // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the // total number of items in bin s for (bin_index s = 0; s < sd->num_bins; ++s) __gnu_sequential::partial_sum(sd->dist[s + 1], sd->dist[s + 1] + d->num_threads + 1, sd->dist[s + 1]); } # pragma omp barrier sequence_index_t offset = 0, global_offset = 0; for (bin_index s = 0; s < d->bins_begin; ++s) global_offset += sd->dist[s + 1][d->num_threads]; # pragma omp barrier for (bin_index s = d->bins_begin; s < d->bins_end; ++s) { for (int t = 0; t < d->num_threads + 1; ++t) sd->dist[s + 1][t] += offset; offset = sd->dist[s + 1][d->num_threads]; } sd->temporaries[iam] = static_cast( ::operator new(sizeof(value_type) * offset)); # pragma omp barrier // Draw local copies to avoid false sharing. for (bin_index b = 0; b < sd->num_bins + 1; ++b) dist[b] = sd->dist[b][iam]; for (bin_index b = 0; b < sd->num_bins; ++b) bin_proc[b] = sd->bin_proc[b]; for (thread_index_t t = 0; t < d->num_threads; ++t) temporaries[t] = sd->temporaries[t]; RandomAccessIterator source = sd->source; difference_type start = sd->starts[iam]; // Distribute according to oracles, second main loop. for (difference_type i = 0; i < length; ++i) { bin_index target_bin = oracles[i]; thread_index_t target_p = bin_proc[target_bin]; // Last column [d->num_threads] stays unchanged. ::new(&(temporaries[target_p][dist[target_bin + 1]++])) value_type(*(source + i + start)); } delete[] oracles; delete[] dist; delete[] bin_proc; delete[] temporaries; # pragma omp barrier // Shuffle bins internally. for (bin_index b = d->bins_begin; b < d->bins_end; ++b) { value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]), * end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads]; sequential_random_shuffle(begin, end, rng); std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads])); } ::operator delete(sd->temporaries[iam]); } /** @brief Round up to the next greater power of 2. * @param x Integer to round up */ template T round_up_to_pow2(T x) { if (x <= 1) return 1; else return (T)1 << (log2(x - 1) + 1); } /** @brief Main parallel random shuffle step. * @param begin Begin iterator of sequence. * @param end End iterator of sequence. * @param n Length of sequence. * @param num_threads Number of threads to use. * @param rng Random number generator to use. */ template void parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits ::difference_type n, thread_index_t num_threads, RandomNumberGenerator& rng) { typedef std::iterator_traits traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; _GLIBCXX_CALL(n) const _Settings& __s = _Settings::get(); if (num_threads > n) num_threads = static_cast(n); bin_index num_bins, num_bins_cache; #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 // Try the L1 cache first. // Must fit into L1. num_bins_cache = std::max( 1, n / (__s.L1_cache_size_lb / sizeof(value_type))); num_bins_cache = round_up_to_pow2(num_bins_cache); // No more buckets than TLB entries, power of 2 // Power of 2 and at least one element per bin, at most the TLB size. num_bins = std::min(n, num_bins_cache); #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB // 2 TLB entries needed per bin. num_bins = std::min(__s.TLB_size / 2, num_bins); #endif num_bins = round_up_to_pow2(num_bins); if (num_bins < num_bins_cache) { #endif // Now try the L2 cache // Must fit into L2 num_bins_cache = static_cast(std::max( 1, n / (__s.L2_cache_size / sizeof(value_type)))); num_bins_cache = round_up_to_pow2(num_bins_cache); // No more buckets than TLB entries, power of 2. num_bins = static_cast( std::min(n, static_cast(num_bins_cache))); // Power of 2 and at least one element per bin, at most the TLB size. #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB // 2 TLB entries needed per bin. num_bins = std::min( static_cast(__s.TLB_size / 2), num_bins); #endif num_bins = round_up_to_pow2(num_bins); #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 } #endif num_threads = std::min(num_threads, num_bins); if (num_threads <= 1) return sequential_random_shuffle(begin, end, rng); DRandomShufflingGlobalData sd(begin); DRSSorterPU* pus; difference_type* starts; # pragma omp parallel num_threads(num_threads) { thread_index_t num_threads = omp_get_num_threads(); # pragma omp single { pus = new DRSSorterPU [num_threads]; sd.temporaries = new value_type*[num_threads]; sd.dist = new difference_type*[num_bins + 1]; sd.bin_proc = new thread_index_t[num_bins]; for (bin_index b = 0; b < num_bins + 1; ++b) sd.dist[b] = new difference_type[num_threads + 1]; for (bin_index b = 0; b < (num_bins + 1); ++b) { sd.dist[0][0] = 0; sd.dist[b][0] = 0; } starts = sd.starts = new difference_type[num_threads + 1]; int bin_cursor = 0; sd.num_bins = num_bins; sd.num_bits = log2(num_bins); difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0; difference_type b%%%%%%in_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads; for (thread_index_t i = 0; i < num_threads; ++i) { starts[i] = start; start += (i < split) ? (chunk_length + 1) : chunk_length; int j = pus[i].bins_begin = bin_cursor; // Range of bins for this processor. bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length; pus[i].bins_end = bin_cursor; for (; j < bin_cursor; ++j) sd.bin_proc[j] = i; pus[i].num_threads = num_threads; pus[i].seed = rng(std::numeric_limits::max()); pus[i].sd = &sd; } starts[num_threads] = start; } //single // Now shuffle in parallel. parallel_random_shuffle_drs_pu(pus); } // parallel delete[] starts; delete[] sd.bin_proc; for (int s = 0; s < (num_bins + 1); ++s) delete[] sd.dist[s]; delete[] sd.dist; delete[] sd.temporaries; delete[] pus; } /** @brief Sequential cache-efficient random shuffle. * @param begin Begin iterator of sequence. * @param end End iterator of sequence. * @param rng Random number generator to use. */ template void sequential_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end, RandomNumberGenerator& rng) { typedef std::iterator_traits traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; difference_type n = end - begin; const _Settings& __s = _Settings::get(); bin_index num_bins, num_bins_cache; #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 // Try the L1 cache first, must fit into L1. num_bins_cache = std::max (1, n / (__s.L1_cache_size_lb / sizeof(value_type))); num_bins_cache = round_up_to_pow2(num_bins_cache); // No more buckets than TLB entries, power of 2 // Power of 2 and at least one element per bin, at most the TLB size num_bins = std::min(n, (difference_type)num_bins_cache); #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB // 2 TLB entries needed per bin num_bins = std::min((difference_type)__s.TLB_size / 2, num_bins); #endif num_bins = round_up_to_pow2(num_bins); if (num_bins < num_bins_cache) { #endif // Now try the L2 cache, must fit into L2. num_bins_cache = static_cast(std::max( 1, n / (__s.L2_cache_size / sizeof(value_type)))); num_bins_cache = round_up_to_pow2(num_bins_cache); // No more buckets than TLB entries, power of 2 // Power of 2 and at least one element per bin, at most the TLB size. num_bins = static_cast (std::min(n, static_cast(num_bins_cache))); #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB // 2 TLB entries needed per bin num_bins = std::min(__s.TLB_size / 2, num_bins); #endif num_bins = round_up_to_pow2(num_bins); #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 } #endif int num_bits = log2(num_bins); if (num_bins > 1) { value_type* target = static_cast( ::operator new(sizeof(value_type) * n)); bin_index* oracles = new bin_index[n]; difference_type* dist0 = new difference_type[num_bins + 1], * dist1 = new difference_type[num_bins + 1]; for (int b = 0; b < num_bins + 1; ++b) dist0[b] = 0; random_number bitrng(rng(0xFFFFFFFF)); for (difference_type i = 0; i < n; ++i) { bin_index oracle = random_number_pow2(num_bits, bitrng); oracles[i] = oracle; // To allow prefix (partial) sum. ++(dist0[oracle + 1]); } // Sum up bins. __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0); for (int b = 0; b < num_bins + 1; ++b) dist1[b] = dist0[b]; // Distribute according to oracles. for (difference_type i = 0; i < n; ++i) ::new(&(target[(dist0[oracles[i]])++])) value_type(*(begin + i)); for (int b = 0; b < num_bins; ++b) { sequential_random_shuffle(target + dist1[b], target + dist1[b + 1], rng); } // Copy elements back. std::copy(target, target + n, begin); delete[] dist0; delete[] dist1; delete[] oracles; ::operator delete(target); } else __gnu_sequential::random_shuffle(begin, end, rng); } /** @brief Parallel random public call. * @param begin Begin iterator of sequence. * @param end End iterator of sequence. * @param rng Random number generator to use. */ template inline void parallel_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end, RandomNumberGenerator rng = random_number()) { typedef std::iterator_traits traits_type; typedef typename traits_type::difference_type difference_type; difference_type n = end - begin; parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ; } } #endif // -*- C++ -*- // Copyright (C) 2007 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/features.h * @brief Defines on whether to include algorithm variants. * * Less variants reduce executable size and compile time. * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Johannes Singler. #ifndef _GLIBCXX_PARALLEL_FEATURES_H #define _GLIBCXX_PARALLEL_FEATURES_H 1 #ifndef _GLIBCXX_MERGESORT /** @def _GLIBCXX_MERGESORT * @brief Include parallel multi-way mergesort. * @see __gnu_parallel::_Settings::sort_algorithm */ #define _GLIBCXX_MERGESORT 1 #endif #ifndef _GLIBCXX_QUICKSORT /** @def _GLIBCXX_QUICKSORT * @brief Include parallel unbalanced quicksort. * @see __gnu_parallel::_Settings::sort_algorithm */ #define _GLIBCXX_QUICKSORT 1 #endif #ifndef _GLIBCXX_BAL_QUICKSORT /** @def _GLIBCXX_BAL_QUICKSORT * @brief Include parallel dynamically load-balanced quicksort. * @see __gnu_parallel::_Settings::sort_algorithm */ #define _GLIBCXX_BAL_QUICKSORT 1 #endif #ifndef _GLIBCXX_FIND_GROWING_BLOCKS /** @brief Include the growing blocks variant for std::find. * @see __gnu_parallel::_Settings::find_algorithm */ #define _GLIBCXX_FIND_GROWING_BLOCKS 1 #endif #ifndef _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS /** @brief Include the equal-sized blocks variant for std::find. * @see __gnu_parallel::_Settings::find_algorithm */ #define _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS 1 #endif #ifndef _GLIBCXX_FIND_EQUAL_SPLIT /** @def _GLIBCXX_FIND_EQUAL_SPLIT * @brief Include the equal splitting variant for std::find. * @see __gnu_parallel::_Settings::find_algorithm */ #define _GLIBCXX_FIND_EQUAL_SPLIT 1 #endif #ifndef _GLIBCXX_TREE_INITIAL_SPLITTING /** @def _GLIBCXX_TREE_INITIAL_SPLITTING * @brief Include the initial splitting variant for * _Rb_tree::insert_unique(InputIterator beg, InputIterator end). * @see __gnu_parallel::_Rb_tree */ #define _GLIBCXX_TREE_INITIAL_SPLITTING 1 #endif #ifndef _GLIBCXX_TREE_DYNAMIC_BALANCING /** @def _GLIBCXX_TREE_DYNAMIC_BALANCING * @brief Include the dynamic balancing variant for * _Rb_tree::insert_unique(InputIterator beg, InputIterator end). * @see __gnu_parallel::_Rb_tree */ #define _GLIBCXX_TREE_DYNAMIC_BALANCING 1 #endif #ifndef _GLIBCXX_TREE_FULL_COPY /** @def _GLIBCXX_TREE_FULL_COPY * @brief In order to sort the input sequence of * _Rb_tree::insert_unique(InputIterator beg, InputIterator end) a * full copy of the input elements is done. * @see __gnu_parallel::_Rb_tree */ #define _GLIBCXX_TREE_FULL_COPY 1 #endif #endif // -*- C++ -*- // Copyright (C) 2007, 2008 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/balanced_quicksort.h * @brief Implementation of a dynamically load-balanced parallel quicksort. * * It works in-place and needs only logarithmic extra memory. * The algorithm is similar to the one proposed in * * P. Tsigas and Y. Zhang. * A simple, fast parallel implementation of quicksort and * its performance evaluation on SUN enterprise 10000. * In 11th Euromicro Conference on Parallel, Distributed and * Network-Based Processing, page 372, 2003. * * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Johannes Singler. #ifndef _GLIBCXX_PARALLEL_BAL_QUICKSORT_H #define _GLIBCXX_PARALLEL_BAL_QUICKSORT_H 1 #include #include #include #include #include #include #include #if _GLIBCXX_ASSERTIONS #include #endif namespace __gnu_parallel { /** @brief Information local to one thread in the parallel quicksort run. */ template struct QSBThreadLocal { typedef std::iterator_traits traits_type; typedef typename traits_type::difference_type difference_type; /** @brief Continuous part of the sequence, described by an iterator pair. */ typedef std::pair Piece; /** @brief Initial piece to work on. */ Piece initial; /** @brief Work-stealing queue. */ RestrictedBoundedConcurrentQueue leftover_parts; /** @brief Number of threads involved in this algorithm. */ thread_index_t num_threads; /** @brief Pointer to a counter of elements left over to sort. */ volatile difference_type* elements_leftover; /** @brief The complete sequence to sort. */ Piece global; /** @brief Constructor. * @param queue_size Size of the work-stealing queue. */ QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { } }; /** @brief Balanced quicksort divide step. * @param begin Begin iterator of subsequence. * @param end End iterator of subsequence. * @param comp Comparator. * @param num_threads Number of threads that are allowed to work on * this part. * @pre @c (end-begin)>=1 */ template typename std::iterator_traits::difference_type qsb_divide(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, thread_index_t num_threads) { _GLIBCXX_PARALLEL_ASSERT(num_threads > 0); typedef std::iterator_traits traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; RandomAccessIterator pivot_pos = median_of_three_iterators(begin, begin + (end - begin) / 2, end - 1, comp); #if defined(_GLIBCXX_ASSERTIONS) // Must be in between somewhere. difference_type n = end - begin; _GLIBCXX_PARALLEL_ASSERT( (!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos)) || (!comp(*pivot_pos, *begin) && !comp(*(end - 1), *pivot_pos)) || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos)) || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*(end - 1), *pivot_pos)) || (!comp(*pivot_pos, *(end - 1)) && !comp(*begin, *pivot_pos)) || (!comp(*pivot_pos, *(end - 1)) && !comp(*(begin + n / 2), *pivot_pos))); #endif // Swap pivot value to end. if (pivot_pos != (end - 1)) std::swap(*pivot_pos, *(end - 1)); pivot_pos = end - 1; __gnu_parallel::binder2nd pred(comp, *pivot_pos); // Divide, returning end - begin - 1 in the worst case. difference_type split_pos = parallel_partition( begin, end - 1, pred, num_threads); // Swap back pivot to middle. std::swap(*(begin + split_pos), *pivot_pos); pivot_pos = begin + split_pos; #if _GLIBCXX_ASSERTIONS RandomAccessIterator r; for (r = begin; r != pivot_pos; ++r) _GLIBCXX_PARALLEL_ASSERT(comp(*r, *pivot_pos)); for (; r != end; ++r) _GLIBCXX_PARALLEL_ASSERT(!comp(*r, *pivot_pos)); #endif return split_pos; } /** @brief Quicksort conquer step. * @param tls Array of thread-local storages. * @param begin Begin iterator of subsequence. * @param end End iterator of subsequence. * @param comp Comparator. * @param iam Number of the thread processing this function. * @param num_threads * Number of threads that are allowed to work on this part. */ template void qsb_conquer(QSBThreadLocal** tls, RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, thread_index_t iam, thread_index_t num_threads, bool parent_wait) { typedef std::iterator_traits traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; difference_type n = end - begin; if (num_threads <= 1 || n <= 1) { tls[iam]->initial.first = begin; tls[iam]->initial.second = end; qsb_local_sort_with_helping(tls, comp, iam, parent_wait); return; } // Divide step. difference_type split_pos = qsb_divide(begin, end, comp, num_threads); #if _GLIBCXX_ASSERTIONS _GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin)); #endif thread_index_t num_threads_leftside = std::max(1, std::min( num_threads - 1, split_pos * num_threads / n)); # pragma omp atomic *tls[iam]->elements_leftover -= (difference_type)1; // Conquer step. # pragma omp parallel num_threads(2) { bool wait; if(omp_get_num_threads() < 2) wait = false; else wait = parent_wait; # pragma omp sections { # pragma omp section { qsb_conquer(tls, begin, begin + split_pos, comp, iam, num_threads_leftside, wait); wait = parent_wait; } // The pivot_pos is left in place, to ensure termination. # pragma omp section { qsb_conquer(tls, begin + split_pos + 1, end, comp, iam + num_threads_leftside, num_threads - num_threads_leftside, wait); wait = parent_wait; } } } } /** * @brief Quicksort step doing load-balanced local sort. * @param tls Array of thread-local storages. * @param comp Comparator. * @param iam Number of the thread processing this function. */ template void qsb_local_sort_with_helping(QSBThreadLocal** tls, Comparator& comp, int iam, bool wait) { typedef std::iterator_traits traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; typedef std::pair Piece; QSBThreadLocal& tl = *tls[iam]; difference_type base_case_n = _Settings::get().sort_qsb_base_case_maximal_n; if (base_case_n < 2) base_case_n = 2; thread_index_t num_threads = tl.num_threads; // Every thread has its own random number generator. random_number rng(iam + 1); Piece current = tl.initial; difference_type elements_done = 0; #if _GLIBCXX_ASSERTIONS difference_type total_elements_done = 0; #endif for (;;) { // Invariant: current must be a valid (maybe empty) range. RandomAccessIterator begin = current.first, end = current.second; difference_type n = end - begin; if (n > base_case_n) { // Divide. RandomAccessIterator pivot_pos = begin + rng(n); // Swap pivot_pos value to end. if (pivot_pos != (end - 1)) std::swap(*pivot_pos, *(end - 1)); pivot_pos = end - 1; __gnu_parallel::binder2nd pred(comp, *pivot_pos); // Divide, leave pivot unchanged in last place. RandomAccessIterator split_pos1, split_pos2; split_pos1 = __gnu_sequential::partition(begin, end - 1, pred); // Left side: < pivot_pos; right side: >= pivot_pos. #if _GLIBCXX_ASSERTIONS _GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end); #endif // Swap pivot back to middle. if (split_pos1 != pivot_pos) std::swap(*split_pos1, *pivot_pos); pivot_pos = split_pos1; // In case all elements are equal, split_pos1 == 0. if ((split_pos1 + 1 - begin) < (n >> 7) || (end - split_pos1) < (n >> 7)) { // Very unequal split, one part smaller than one 128th // elements not strictly larger than the pivot. __gnu_parallel::unary_negate<__gnu_parallel::binder1st , value_type> pred(__gnu_parallel::binder1st (comp, *pivot_pos)); // Find other end of pivot-equal range. split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred); } else // Only skip the pivot. split_pos2 = split_pos1 + 1; // Elements equal to pivot are done. elements_done += (split_pos2 - split_pos1); #if _GLIBCXX_ASSERTIONS total_elements_done += (split_pos2 - split_pos1); #endif // Always push larger part onto stack. if (((split_pos1 + 1) - begin) < (end - (split_pos2))) { // Right side larger. if ((split_pos2) != end) tl.leftover_parts.push_front(std::make_pair(split_pos2, end)); //current.first = begin; //already set anyway current.second = split_pos1; continue; } else { // Left side larger. if (begin != split_pos1) tl.leftover_parts.push_front(std::make_pair(begin, split_pos1)); current.first = sp)%*%+%,%-%lit_pos2; //current.second = end; //already set anyway continue; } } else { __gnu_sequential::sort(begin, end, comp); elements_done += n; #if _GLIBCXX_ASSERTIONS total_elements_done += n; #endif // Prefer own stack, small pieces. if (tl.leftover_parts.pop_front(current)) continue; # pragma omp atomic *tl.elements_leftover -= elements_done; elements_done = 0; #if _GLIBCXX_ASSERTIONS double search_start = omp_get_wtime(); #endif // Look for new work. bool successfully_stolen = false; while (wait && *tl.elements_leftover > 0 && !successfully_stolen #if _GLIBCXX_ASSERTIONS // Possible dead-lock. && (omp_get_wtime() < (search_start + 1.0)) #endif ) { thread_index_t victim; victim = rng(num_threads); // Large pieces. successfully_stolen = (victim != iam) && tls[victim]->leftover_parts.pop_back(current); if (!successfully_stolen) yield(); #if !defined(__ICC) && !defined(__ECC) # pragma omp flush #endif } #if _GLIBCXX_ASSERTIONS if (omp_get_wtime() >= (search_start + 1.0)) { sleep(1); _GLIBCXX_PARALLEL_ASSERT(omp_get_wtime() < (search_start + 1.0)); } #endif if (!successfully_stolen) { #if _GLIBCXX_ASSERTIONS _GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0); #endif return; } } } } /** @brief Top-level quicksort routine. * @param begin Begin iterator of sequence. * @param end End iterator of sequence. * @param comp Comparator. * @param n Length of the sequence to sort. * @param num_threads Number of threads that are allowed to work on * this part. */ template void parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, typename std::iterator_traits ::difference_type n, thread_index_t num_threads) { _GLIBCXX_CALL(end - begin) typedef std::iterator_traits traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; typedef std::pair Piece; typedef QSBThreadLocal tls_type; if (n <= 1) return; // At least one element per processor. if (num_threads > n) num_threads = static_cast(n); // Initialize thread local storage tls_type** tls = new tls_type*[num_threads]; difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1); for (thread_index_t t = 0; t < num_threads; ++t) tls[t] = new QSBThreadLocal(queue_size); // There can never be more than ceil(log2(n)) ranges on the stack, because // 1. Only one processor pushes onto the stack // 2. The largest range has at most length n // 3. Each range is larger than half of the range remaining volatile difference_type elements_leftover = n; for (int i = 0; i < num_threads; ++i) { tls[i]->elements_leftover = &elements_leftover; tls[i]->num_threads = num_threads; tls[i]->global = std::make_pair(begin, end); // Just in case nothing is left to assign. tls[i]->initial = std::make_pair(end, end); } // Main recursion call. qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true); #if _GLIBCXX_ASSERTIONS // All stack must be empty. Piece dummy; for (int i = 1; i < num_threads; ++i) _GLIBCXX_PARALLEL_ASSERT(!tls[i]->leftover_parts.pop_back(dummy)); #endif for (int i = 0; i < num_threads; ++i) delete tls[i]; delete[] tls; } } // namespace __gnu_parallel #endif // -*- C++ -*- // Copyright (C) 2007, 2008 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/unique_copy.h * @brief Parallel implementations of std::unique_copy(). * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Robert Geisberger and Robin Dapp. #ifndef _GLIBCXX_PARALLEL_UNIQUE_H #define _GLIBCXX_PARALLEL_UNIQUE_H 1 #include #include namespace __gnu_parallel { /** @brief Parallel std::unique_copy(), w/o explicit equality predicate. * @param first Begin iterator of input sequence. * @param last End iterator of input sequence. * @param result Begin iterator of result sequence. * @param binary_pred Equality predicate. * @return End iterator of result sequence. */ template OutputIterator parallel_unique_copy(InputIterator first, InputIterator last, OutputIterator result, BinaryPredicate binary_pred) { _GLIBCXX_CALL(last - first) typedef std::iterator_traits traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; difference_type size = last - first; if (size == 0) return result; // Let the first thread process two parts. difference_type *counter; difference_type *borders; thread_index_t num_threads = get_max_threads(); // First part contains at least one element. # pragma omp parallel num_threads(num_threads) { # pragma omp single { num_threads = omp_get_num_threads(); borders = new difference_type[num_threads + 2]; equally_split(size, num_threads + 1, borders); counter = new difference_type[num_threads + 1]; } thread_index_t iam = omp_get_thread_num(); difference_type begin, end; // Check for length without duplicates // Needed for position in output difference_type i = 0; OutputIterator out = result; if (iam == 0) { begin = borders[0] + 1; // == 1 end = borders[iam + 1]; ++i; *out++ = *first; for (InputIterator iter = first + begin; iter < first + end; ++iter) { if (!binary_pred(*iter, *(iter-1))) { ++i; *out++ = *iter; } } } else { begin = borders[iam]; //one part end = borders[iam + 1]; for (InputIterator iter = first + begin; iter < first + end; ++iter) { if (!binary_pred(*iter, *(iter - 1))) ++i; } } counter[iam] = i; // Last part still untouched. difference_type begin_output; # pragma omp barrier // Store result in output on calculated positions. begin_output = 0; if (iam == 0) { for (int t = 0; t < num_threads; ++t) begin_output += counter[t]; i = 0; OutputIterator iter_out = result + begin_output; begin = borders[num_threads]; end = size; for (InputIterator iter = first + begin; iter < first + end; ++iter) { if (iter == first || !binary_pred(*iter, *(iter - 1))) { ++i; *iter_out++ = *iter; } } counter[num_threads] = i; } else { for (int t = 0; t < iam; t++) begin_output += counter[t]; OutputIterator iter_out = result + begin_output; for (InputIterator iter = first + begin; iter < first + end; ++iter) { if (!binary_pred(*iter, *(iter-1))) *iter_out++ = *iter; } } } difference_type end_output = 0; for (int t = 0; t < num_threads + 1; t++) end_output += counter[t]; delete[] borders; return result + end_output; } /** @brief Parallel std::unique_copy(), without explicit equality predicate * @param first Begin iterator of input sequence. * @param last End iterator of input sequence. * @param result Begin iterator of result sequence. * @return End iterator of result sequence. */ template inline OutputIterator parallel_unique_copy(InputIterator first, InputIterator last, OutputIterator result) { typedef typename std::iterator_traits::value_type value_type; return parallel_unique_copy(first, last, result, std::equal_to()); } }//namespace __gnu_parallel #endif // -*- C++ -*- // Copyright (C) 2007, 2008 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/types.h * @brief Basic types and typedefs. * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Johannes Singler and Felix Putze. #ifndef _GLIBCXX_PARALLEL_TYPES_H #define _GLIBCXX_PARALLEL_TYPES_H 1 #include namespace __gnu_parallel { // Enumerated types. /// Run-time equivalents for the compile-time tags. enum _Parallelism { /// Not parallel. sequential, /// Parallel unbalanced (equal-sized chunks). parallel_unbalanced, /// Parallel balanced (work-stealing). parallel_balanced, /// Parallel with OpenMP dynamic load-balancing. parallel_omp_loop, /// Parallel with OpenMP static load-balancing. parallel_omp_loop_static, /// Parallel with OpenMP taskqueue construct. parallel_taskqueue }; /// Strategies for run-time algorithm selection: // force_sequential, force_parallel, heuristic. enum _AlgorithmStrategy { heuristic, force_sequential, force_parallel }; /// Sorting algorithms: // multi-way mergesort, quicksort, load-balanced quicksort. enum _SortAlgorithm { MWMS, QS, QS_BALANCED }; /// Merging algorithms: // bubblesort-alike, loser-tree variants, enum sentinel. enum _MultiwayMergeAlgorithm { LOSER_TREE }; /// Partial sum algorithms: recursive, linear. enum _PartialSumAlgorithm { RECURSIVE, LINEAR }; /// Sorting/merging algorithms: sampling, exact. enum _SplittingAlgorithm { SAMPLING, EXACT }; /// Find algorithms: // growing blocks, equal-sized blocks, equal splitting. enum _FindAlgorithm { GROWING_BLOCKS, CONSTANT_SIZE_BLOCKS, EQUAL_SPLIT }; /// Integer Types. // XXX need to use /** @brief 16-bit signed integer. */ typedef short int16; /** @brief 16-bit unsigned integer. */ typedef unsigned short uint16; /** @brief 32-bit signed integer. */ typedef int int32; /** @brief 32-bit unsigned integer. */ typedef unsigned int uint32; /** @brief 64-bit signed integer. */ typedef long long int64; /** @brief 64-bit unsigned integer. */ typedef unsigned long long uint64; /** * @brief Unsigned integer to index elements. * The total number of elements for each algorithm must fit into this type. */ typedef uint64 sequence_index_t; /** * @brief Unsigned integer to index a thread number. * The maximum thread number (for each processor) must fit into this type. */ typedef uint16 thread_index_t; // XXX atomics interface? /// Longest compare-and-swappable integer type on this platform. typedef int64 lcas_t; // XXX numeric_limits::digits? /// Number of bits of ::lcas_t. static const int lcas_t_bits = sizeof(lcas_t) * 8; /// ::lcas_t with the right half of bits set to 1. static const lcas_t lcas_t_mask = ((lcas_t(1) << (lcas_t_bits / 2)) - 1); } #endif /* _GLIBCXX_TYPES_H */ // -*- C++ -*- // Copyright (C) 2007, 2008 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/iterator.h * @brief Helper iterator classes for the std::transform() functions. * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Johannes Singler. #ifndef _GLIBCXX_PARALLEL_ITERATOR_H #define _GLIBCXX_PARALLEL_ITERATOR_H 1 #include #include namespace __gnu_parallel { /** @brief A pair of iterators. The usual iterator operations are * applied to both child iterators. */ template class iterator_pair : public std::pair { private: typedef iterator_pair type; typedef std::pair base_type; public: typedef IteratorCategory iterator_category; typedef void value_type; typedef std::iterator_traits traits_type; typedef typename traits_type::difference_type difference_type; typedef type* pointer; typedef type& reference; iterator_pair() { } iterator_pair(const Iterator1& first, const Iterator2& second) : base_type(first, second) { } // Pre-increment operator. type& operator++() { ++base_type::first; ++base_type::second; return *this; } // Post-increment operator. const type operator++(int) { return type(base_type::first++, base_type::second++); } // Pre-decrement operator. type& operator--() { --base_type::first; --base_type::second; return *this; } // Post-decrement operator. const type operator--(int) { return type(base_type::first--, base_type::second--); } // Type conversion. operator Iterator2() const { return base_type::second; } type& operator=(const type& other) { base_type::first = other.first; base_type::second = other.second; return *this; } type operator+(difference_type delta) const { return type(base_type::first + delta, base_type::second + delta); } difference_type operator-(const type& other) const { return base_type::first - other.first; } }; /** @brief A triple of iterators. The usual iterator operations are applied to all three child iterators. */ template class iterator_triple { private: typedef iterator_triple type; public: typedef IteratorCategory iterator_category; typedef void value_type; typedef typename std::iterator_traits::difference_type difference_type; typedef type* pointer; typedef type& reference; Iterator1 first; Iterator2 second; Iterator3 third; iterator_triple() { } iterator_triple(const Iterator1& _first, const Iterator2& _second, const Iterator3& _third) { first = _first; second = _second; third = _third; } // Pre-increment operator. type& operator++() { ++first; ++second; ++third; return *this; } // Post-increment operator. const type operator++(int) { return type(first++, second++, third++); } // Pre-decrement operator. type& operator--() { --first; --second; --third; return *this; } // Post-decrement operator. const type operator--(int) { return type(first--, second--, third--); } // Type conversion. operator Iterator3() const { return third; } type& operator=(const type& other) { first = other.first; second = other.second; third = other.third; return *this; } type operator+(difference_type delta) const { return type(first + delta, second + delta, third + delta); } difference_type operator-(const type& other) const { return first - other.first; } }; } #endif // -*- C++ -*- // Copyright (C) 2007, 2008 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or