// -*- C++ -*- // Copyright (C) 2007, 2008 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/multiseq_selection.h * @brief Functions to find elements of a certain global rank in * multiple sorted sequences. Also serves for splitting such * sequence sets. * * The algorithm description can be found in * * P. J. Varman, S. D. Scheufler, B. R. Iyer, and G. R. Ricard. * Merging Multiple Lists on Hierarchical-Memory Multiprocessors. * Journal of Parallel and Distributed Computing, 12(2):171–177, 1991. * * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Johannes Singler. #ifndef _GLIBCXX_PARALLEL_MULTISEQ_SELECTION_H #define _GLIBCXX_PARALLEL_MULTISEQ_SELECTION_H 1 #include #include #include #include namespace __gnu_parallel { /** @brief Compare a pair of types lexicographically, ascending. */ template class lexicographic : public std::binary_function, std::pair, bool> { private: Comparator& comp; public: lexicographic(Comparator& _comp) : comp(_comp) { } bool operator()(const std::pair& p1, const std::pair& p2) const { if (comp(p1.first, p2.first)) return true; if (comp(p2.first, p1.first)) return false; // Firsts are equal. return p1.second < p2.second; } }; /** @brief Compare a pair of types lexicographically, descending. */ template class lexicographic_reverse : public std::binary_function { private: Comparator& comp; public: lexicographic_reverse(Comparator& _comp) : comp(_comp) { } bool operator()(const std::pair& p1, const std::pair& p2) const { if (comp(p2.first, p1.first)) return true; if (comp(p1.first, p2.first)) return false; // Firsts are equal. return p2.second < p1.second; } }; /** * @brief Splits several sorted sequences at a certain global rank, * resulting in a splitting point for each sequence. * The sequences are passed via a sequence of random-access * iterator pairs, none of the sequences may be empty. If there * are several equal elements across the split, the ones on the * left side will be chosen from sequences with smaller number. * @param begin_seqs Begin of the sequence of iterator pairs. * @param end_seqs End of the sequence of iterator pairs. * @param rank The global rank to partition at. * @param begin_offsets A random-access sequence begin where the * result will be stored in. Each element of the sequence is an * iterator that points to the first element on the greater part of * the respective sequence. * @param comp The ordering functor, defaults to std::less. */ template void multiseq_partition(RanSeqs begin_seqs, RanSeqs end_seqs, RankType rank, RankIterator begin_offsets, Comparator comp = std::less< typename std::iterator_traits::value_type:: first_type>::value_type>()) // std::less { _GLIBCXX_CALL(end_seqs - begin_seqs) typedef typename std::iterator_traits::value_type::first_type It; typedef typename std::iterator_traits::difference_type difference_type; typedef typename std::iterator_traits::value_type value_type; lexicographic lcomp(comp); lexicographic_reverse lrcomp(comp); // Number of sequences, number of elements in total (possibly // including padding). difference_type m = std::distance(begin_seqs, end_seqs), N = 0, nmax, n, r; for (int i = 0; i < m; i++) { N += std::distance(begin_seqs[i].first, begin_seqs[i].second); _GLIBCXX_PARALLEL_ASSERT( std::distance(begin_seqs[i].first, begin_seqs[i].second) > 0); } if (rank == N) { for (int i = 0; i < m; i++) begin_offsets[i] = begin_seqs[i].second; // Very end. // Return m - 1; return; } _GLIBCXX_PARALLEL_ASSERT(m != 0); _GLIBCXX_PARALLEL_ASSERT(N != 0); _GLIBCXX_PARALLEL_ASSERT(rank >= 0); _GLIBCXX_PARALLEL_ASSERT(rank < N); difference_type* ns = new difference_type[m]; difference_type* a = new difference_type[m]; difference_type* b = new difference_type[m]; difference_type l; ns[0] = std::distance(begin_seqs[0].first, begin_seqs[0].second); nmax = ns[0]; for (int i = 0; i < m; i++) { ns[i] = std::distance(begin_seqs[i].first, begin_seqs[i].second); nmax = std::max(nmax, ns[i]); } r = log2(nmax) + 1; // Pad all lists to this length, at least as long as any ns[i], // equality iff nmax = 2^k - 1. l = (1ULL << r) - 1; // From now on, including padding. N = l * m; for (int i = 0; i < m; i++) { a[i] = 0; b[i] = l; } n = l / 2; // Invariants: // 0 <= a[i] <= ns[i], 0 <= b[i] <= l #define S(i) (begin_seqs[i].first) // Initial partition. std::vector > sample; for (int i = 0; i < m; i++) if (n < ns[i]) //sequence long enough sample.push_back(std::make_pair(S(i)[n], i)); __gnu_sequential::sort(sample.begin(), sample.end(), lcomp); for (int i = 0; i < m; i++) //conceptual infinity if (n >= ns[i]) //sequence too short, conceptual infinity sample.push_back(std::make_pair(S(i)[0] /*dummy element*/, i)); difference_type localrank = rank * m / N ; int j; for (j = 0; j < localrank && ((n + 1) <= ns[sample[j].second]); ++j) a[sample[j].second] += n + 1; for (; j < m; j++) b[sample[j].second] -= n + 1; // Further refinement. while (n > 0) { n /= 2; int lmax_seq = -1; // to avoid warning const value_type* lmax = NULL; // impossible to avoid the warning? for (int i = 0; i < m; i++) { if (a[i] > 0) { if (!lmax) { lmax = &(S(i)[a[i] - 1]); lmax_seq = i; } else { // Max, favor rear sequences. if (!comp(S(i)[a[i] - 1], *lmax)) { lmax = &(S(i)[a[i] - 1]); lmax_seq = i; } } } } int i; for (i = 0; i < m; i++) { difference_type middle = (b[i] + a[i]) / 2; if (lmax && middle < ns[i] && lcomp(std::make_pair(S(i)[middle], i), std::make_pair(*lmax, lmax_seq))) a[i] = std::min(a[i] + n + 1, ns[i]); else b[i] -= n + 1; } difference_type leftsize = 0, total = 0; for (int i = 0; i < m; i++) { leftsize += a[i] / (n + 1); total += l / (n + 1); } difference_type skew = static_cast (static_cast(total) * rank / N - leftsize); if (skew > 0) { // Move to the left, find smallest. std::priority_queue, std::vector >, lexicographic_reverse > pq(lrcomp); for (int i = 0; i < m; i++) if (b[i] < ns[i]) pq.push(std::make_pair(S(i)[b[i]], i)); for (; skew != 0 && !pq.empty(); --skew) { int source = pq.top().second; pq.pop(); a[source] = std::min(a[source] + n + 1, ns[source]); b[source] += n + 1; if (b[source] < ns[source]) pq.push(std::make_pair(S(source)[b[source]], source)); } } else if (skew < 0) { // Move to the right, find greatest. std::priority_queue, std::vector >, lexicographic > pq(lcomp); for (int i = 0; i < m; i++) if (a[i] > 0) pq.push(std::make_pair(S(i)[a[i] - 1], i)); for (; skew != 0; ++skew) { int source = pq.top().second; pq.pop(); a[source] -= n + 1; b[source] -= n + 1; if (a[source] > 0) pq.push(std::make_pair(S(source)[a[source] - 1], source)); } } } // Postconditions: // a[i] == b[i] in most cases, except when a[i] has been clamped // because of having reached the boundary // Now return the result, calculate the offset. // Compare the keys on both edges of the border. // Maximum of left edge, minimum of right edge. value_type* maxleft = NULL; value_type* minright = NULL; for (int i = 0; i < m; i++) { if (a[i] > 0) { if (!maxleft) maxleft = &(S(i)[a[i] - 1]); else { // Max, favor rear sequences. if (!comp(S(i)[a[i] - 1], *maxleft)) maxleft = &(S(i)[a[i] - 1]); } } if (b[i] < ns[i]) { if (!minright) minright = &(S(i)[b[i]]); else { // Min, favor fore sequences. if (comp(S(i)[b[i]], *minright)) minright = &(S(i)[b[i]]); } } } int seq = 0; for (int i = 0; i < m; i++) begin_offsets[i] = S(i) + a[i]; delete[] ns; delete[] a; delete[] b; } /** * @brief Selects the element at a certain global rank from several * sorted sequences. * * The sequences are passed via a sequence of random-access * iterator pairs, none of the sequences may be empty. * @param begin_seqs Begin of the sequence of iterator pairs. * @param end_seqs End of the sequence of iterator pairs. * @param rank The global rank to partition at. * @param offset The rank of the selected element in the global * subsequence of elements equal to the selected element. If the * selected element is unique, this number is 0. * @param comp The ordering functor, defaults to std::less. */ template T multiseq_selection(RanSeqs begin_seqs, RanSeqs end_seqs, RankType rank, RankType& offset, Comparator comp = std::less()) { _GLIBCXX_CALL(end_seqs - begin_seqs) typedef typename std::iterator_traits::value_type::first_type It; typedef typename std::iterator_traits::difference_type difference_type; lexicographic lcomp(comp); lexicographic_reverse lrcomp(comp); // Number of sequences, number of elements in total (possibly // including padding). difference_type m = std::distance(begin_seqs, end_seqs); difference_type N = 0; difference_type nmax, n, r; for (int i = 0; i < m; i++) N += std::distance(begin_$Ž$$$‘$seqs[i].first, begin_seqs[i].second); if (m == 0 || N == 0 || rank < 0 || rank >= N) { // Result undefined when there is no data or rank is outside bounds. throw std::exception(); } difference_type* ns = new difference_type[m]; difference_type* a = new difference_type[m]; difference_type* b = new difference_type[m]; difference_type l; ns[0] = std::distance(begin_seqs[0].first, begin_seqs[0].second); nmax = ns[0]; for (int i = 0; i < m; ++i) { ns[i] = std::distance(begin_seqs[i].first, begin_seqs[i].second); nmax = std::max(nmax, ns[i]); } r = log2(nmax) + 1; // Pad all lists to this length, at least as long as any ns[i], // equality iff nmax = 2^k - 1 l = pow2(r) - 1; // From now on, including padding. N = l * m; for (int i = 0; i < m; ++i) { a[i] = 0; b[i] = l; } n = l / 2; // Invariants: // 0 <= a[i] <= ns[i], 0 <= b[i] <= l #define S(i) (begin_seqs[i].first) // Initial partition. std::vector > sample; for (int i = 0; i < m; i++) if (n < ns[i]) sample.push_back(std::make_pair(S(i)[n], i)); __gnu_sequential::sort(sample.begin(), sample.end(), lcomp, sequential_tag()); // Conceptual infinity. for (int i = 0; i < m; i++) if (n >= ns[i]) sample.push_back(std::make_pair(S(i)[0] /*dummy element*/, i)); difference_type localrank = rank * m / N ; int j; for (j = 0; j < localrank && ((n + 1) <= ns[sample[j].second]); ++j) a[sample[j].second] += n + 1; for (; j < m; ++j) b[sample[j].second] -= n + 1; // Further refinement. while (n > 0) { n /= 2; const T* lmax = NULL; for (int i = 0; i < m; ++i) { if (a[i] > 0) { if (!lmax) lmax = &(S(i)[a[i] - 1]); else { if (comp(*lmax, S(i)[a[i] - 1])) //max lmax = &(S(i)[a[i] - 1]); } } } int i; for (i = 0; i < m; i++) { difference_type middle = (b[i] + a[i]) / 2; if (lmax && middle < ns[i] && comp(S(i)[middle], *lmax)) a[i] = std::min(a[i] + n + 1, ns[i]); else b[i] -= n + 1; } difference_type leftsize = 0, total = 0; for (int i = 0; i < m; ++i) { leftsize += a[i] / (n + 1); total += l / (n + 1); } difference_type skew = ((unsigned long long)total * rank / N - leftsize); if (skew > 0) { // Move to the left, find smallest. std::priority_queue, std::vector >, lexicographic_reverse > pq(lrcomp); for (int i = 0; i < m; ++i) if (b[i] < ns[i]) pq.push(std::make_pair(S(i)[b[i]], i)); for (; skew != 0 && !pq.empty(); --skew) { int source = pq.top().second; pq.pop(); a[source] = std::min(a[source] + n + 1, ns[source]); b[source] += n + 1; if (b[source] < ns[source]) pq.push(std::make_pair(S(source)[b[source]], source)); } } else if (skew < 0) { // Move to the right, find greatest. std::priority_queue, std::vector >, lexicographic > pq(lcomp); for (int i = 0; i < m; ++i) if (a[i] > 0) pq.push(std::make_pair(S(i)[a[i] - 1], i)); for (; skew != 0; ++skew) { int source = pq.top().second; pq.pop(); a[source] -= n + 1; b[source] -= n + 1; if (a[source] > 0) pq.push(std::make_pair(S(source)[a[source] - 1], source)); } } } // Postconditions: // a[i] == b[i] in most cases, except when a[i] has been clamped // because of having reached the boundary // Now return the result, calculate the offset. // Compare the keys on both edges of the border. // Maximum of left edge, minimum of right edge. bool maxleftset = false, minrightset = false; // Impossible to avoid the warning? T maxleft, minright; for (int i = 0; i < m; ++i) { if (a[i] > 0) { if (!maxleftset) { maxleft = S(i)[a[i] - 1]; maxleftset = true; } else { // Max. if (comp(maxleft, S(i)[a[i] - 1])) maxleft = S(i)[a[i] - 1]; } } if (b[i] < ns[i]) { if (!minrightset) { minright = S(i)[b[i]]; minrightset = true; } else { // Min. if (comp(S(i)[b[i]], minright)) minright = S(i)[b[i]]; } } } // Minright is the splitter, in any case. if (!maxleftset || comp(minright, maxleft)) { // Good luck, everything is split unambiguously. offset = 0; } else { // We have to calculate an offset. offset = 0; for (int i = 0; i < m; ++i) { difference_type lb = std::lower_bound(S(i), S(i) + ns[i], minright, comp) - S(i); offset += a[i] - lb; } } delete[] ns; delete[] a; delete[] b; return minright; } } #undef S #endif // -*- C++ -*- // Copyright (C) 2007, 2008 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/find.h * @brief Parallel implementation base for std::find(), std::equal() * and related functions. * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Felix Putze and Johannes Singler. #ifndef _GLIBCXX_PARALLEL_FIND_H #define _GLIBCXX_PARALLEL_FIND_H 1 #include #include #include #include #include namespace __gnu_parallel { /** * @brief Parallel std::find, switch for different algorithms. * @param begin1 Begin iterator of first sequence. * @param end1 End iterator of first sequence. * @param begin2 Begin iterator of second sequence. Must have same * length as first sequence. * @param pred Find predicate. * @param selector Functionality (e. g. std::find_if (), std::equal(),...) * @return Place of finding in both sequences. */ template inline std::pair find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, RandomAccessIterator2 begin2, Pred pred, Selector selector) { switch (_Settings::get().find_algorithm) { case GROWING_BLOCKS: return find_template(begin1, end1, begin2, pred, selector, growing_blocks_tag()); case CONSTANT_SIZE_BLOCKS: return find_template(begin1, end1, begin2, pred, selector, constant_size_blocks_tag()); case EQUAL_SPLIT: return find_template(begin1, end1, begin2, pred, selector, equal_split_tag()); default: _GLIBCXX_PARALLEL_ASSERT(false); return std::make_pair(begin1, begin2); } } #if _GLIBCXX_FIND_EQUAL_SPLIT /** * @brief Parallel std::find, equal splitting variant. * @param begin1 Begin iterator of first sequence. * @param end1 End iterator of first sequence. * @param begin2 Begin iterator of second sequence. Second sequence * must have same length as first sequence. * @param pred Find predicate. * @param selector Functionality (e. g. std::find_if (), std::equal(),...) * @return Place of finding in both sequences. */ template std::pair find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, RandomAccessIterator2 begin2, Pred pred, Selector selector, equal_split_tag) { _GLIBCXX_CALL(end1 - begin1) typedef std::iterator_traits traits_type; typedef typename traits_type::difference_type difference_type; typedef typename traits_type::value_type value_type; difference_type length = end1 - begin1; difference_type result = length; difference_type* borders; omp_lock_t result_lock; omp_init_lock(&result_lock); thread_index_t num_threads = get_max_threads(); # pragma omp parallel num_threads(num_threads) { # pragma omp single { num_threads = omp_get_num_threads(); borders = new difference_type[num_threads + 1]; equally_split(length, num_threads, borders); } //single thread_index_t iam = omp_get_thread_num(); difference_type start = borders[iam], stop = borders[iam + 1]; RandomAccessIterator1 i1 = begin1 + start; RandomAccessIterator2 i2 = begin2 + start; for (difference_type pos = start; pos < stop; ++pos) { #pragma omp flush(result) // Result has been set to something lower. if (result < pos) break; if (selector(i1, i2, pred)) { omp_set_lock(&result_lock); if (pos < result) result = pos; omp_unset_lock(&result_lock); break; } ++i1; ++i2; } } //parallel omp_destroy_lock(&result_lock); delete[] borders; return std::pair(begin1 + result, begin2 + result); } #endif #if _GLIBCXX_FIND_GROWING_BLOCKS /** * @brief Parallel std::find, growing block size variant. * @param begin1 Begin iterator of first sequence. * @param end1 End iterator of first sequence. * @param begin2 Begin iterator of second sequence. Second sequence * must have same length as first sequence. * @param pred Find predicate. * @param selector Functionality (e. g. std::find_if (), std::equal(),...) * @return Place of finding in both sequences. * @see __gnu_parallel::_Settings::find_sequential_search_size * @see __gnu_parallel::_Settings::find_initial_block_size * @see __gnu_parallel::_Settings::find_maximum_block_size * @see __gnu_parallel::_Settings::find_increasing_factor * * There are two main differences between the growing blocks and * the constant-size blocks variants. * 1. For GB, the block size grows; for CSB, the block size is fixed. * 2. For GB, the blocks are allocated dynamically; * for CSB, the blocks are allocated in a predetermined manner, * namely spacial round-robin. */ template std::pair find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, RandomAccessIterator2 begin2, Pred pred, Selector selector, growing_blocks_tag) { _GLIBCXX_CALL(end1 - begin1) typedef std::iterator_traits traits_type; typedef typename traits_type::difference_type difference_type; typedef typename traits_type::value_type value_type; const _Settings& __s = _Settings::get(); difference_type length = end1 - begin1; difference_type sequential_search_size = std::min(length, __s.find_sequential_search_size); // Try it sequentially first. std::pair find_seq_result = selector.sequential_algorithm( begin1, begin1 + sequential_search_size, begin2, pred); if (find_seq_result.first != (begin1 + sequential_search_size)) return find_seq_result; // Index of beginning of next free block (after sequential find). difference_type next_block_start = sequential_search_size; difference_type result = length; omp_lock_t result_lock; omp_init_lock(&result_lock); thread_index_t num_threads = get_max_threads(); # pragma omp parallel shared(result) num_threads(num_threads) { # pragma omp single num_threads = omp_get_num_threads(); // Not within first k elements -> start parallel. thread_index_t iam = omp_get_thread_num(); difference_type block_size = __s.find_initial_block_size; difference_type start = fetch_and_add(&next_block_start, block_size); // Get new block, update pointer to next block. difference_type stop = std::min(length, start + block_size); std::pair local_result; while (start < length) { # pragma omp flush(result) // Get new value of result. if (result < start) { // No chance to find first element. break; } local_result = selector.sequential_algorithm( begin1 + start, begin1 + stop, begin2 + start, pred); if (local_result.first != (begin1 + stop)) { omp_set_lock(&result_lock); if ((local_result.first - begin1) < result) { result = local_result.first - begin1; // Result cannot be in future blocks, stop algorithm. fetch_and_add(&next_block_start, length); } omp_unset_lock(&result_lock); } block_size = std::min(block_size * __s.find_increasing_factor, __s.find_maximum_block_size); // Get new block, update pointer to next block. start = fetch_and_add(&next_block_start, block_size); stop = ((length < (start + block_size)) ? length : (start + block_size)); } } //parallel omp_destroy_lock(&result_lock); // Return iterator on found element. return std::pair(begin1 + result, begin2 + result); } #endif #if _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS /** * @brief Parallel std::find, constant block size variant. * @param begin1 Begin iterator of first sequence. * @param end1 End iterator of first sequence. * @param begin2 Begin iterator of second sequence. Second sequence * must have same length as first sequence. * @param pred Find predicate. * @param selector Functionality (e. g. std::find_if (), std::equal(),...) * @return Place of finding in both sequences. * @see __gnu_parallel::_Settings::find_sequential_search_size * @see __gnu_parallel::_Settings::find_block_size * There are two main differences between the growing blocks and the * constant-size blocks variants. * 1. For GB, the block size grows; for CSB, the block size is fixed. * 2. For GB, the blocks are allocated dynamically; for CSB, the * blocks are allocated in a predetermined manner, namely spacial * round-robin. */ template std::pair find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, RandomAccessIterator2 begin2, Pred pred, Selector selector, constant_size_blocks_tag) { _GLIBCXX_CALL(end1 - begin1) typedef std::iterator_traits traits_type; typedef typename traits_type::difference_type difference_type; typedef typename traits_type::value_type value_type; const _Settings& __s = _Settings::get(); difference_type length = end1 - begin1; difference_type sequential_search_size = std::min( length, __s.find_sequential_search_size); // Try it sequentially first. std::pair find_seq_result = selector.sequential_algorithm(begin1, begŸ$ $¡$in1 + sequential_search_size, begin2, pred); if (find_seq_result.first != (begin1 + sequential_search_size)) return find_seq_result; difference_type result = length; omp_lock_t result_lock; omp_init_lock(&result_lock); // Not within first sequential_search_size elements -> start parallel. thread_index_t num_threads = get_max_threads(); # pragma omp parallel shared(result) num_threads(num_threads) { # pragma omp single num_threads = omp_get_num_threads(); thread_index_t iam = omp_get_thread_num(); difference_type block_size = __s.find_initial_block_size; // First element of thread's current iteration. difference_type iteration_start = sequential_search_size; // Where to work (initialization). difference_type start = iteration_start + iam * block_size; difference_type stop = std::min(length, start + block_size); std::pair local_result; while (start < length) { // Get new value of result. # pragma omp flush(result) // No chance to find first element. if (result < start) break; local_result = selector.sequential_algorithm( begin1 + start, begin1 + stop, begin2 + start, pred); if (local_result.first != (begin1 + stop)) { omp_set_lock(&result_lock); if ((local_result.first - begin1) < result) result = local_result.first - begin1; omp_unset_lock(&result_lock); // Will not find better value in its interval. break; } iteration_start += num_threads * block_size; // Where to work. start = iteration_start + iam * block_size; stop = std::min(length, start + block_size); } } //parallel omp_destroy_lock(&result_lock); // Return iterator on found element. return std::pair(begin1 + result, begin2 + result); } #endif } // end namespace #endif // -*- C++ -*- // Copyright (C) 2007, 2008 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/compatibility.h * @brief Compatibility layer, mostly concerned with atomic operations. * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Felix Putze. #ifndef _GLIBCXX_PARALLEL_COMPATIBILITY_H #define _GLIBCXX_PARALLEL_COMPATIBILITY_H 1 #include #include #if defined(__SUNPRO_CC) && defined(__sparc) #include #endif #if !defined(_WIN32) || defined (__CYGWIN__) #include #endif #if defined(_MSC_VER) #include #include #undef max #undef min #endif #ifdef __MINGW32__ // Including will drag in all the windows32 names. Since // that can cause user code portability problems, we just declare the // one needed function here. extern "C" __attribute((dllimport)) void __attribute__((stdcall)) Sleep (unsigned long); #endif namespace __gnu_parallel { #if defined(__ICC) template int32 faa32(int32* x, int32 inc) { asm volatile("lock xadd %0,%1" : "=r" (inc), "=m" (*x) : "0" (inc) : "memory"); return inc; } #if defined(__x86_64) template int64 faa64(int64* x, int64 inc) { asm volatile("lock xadd %0,%1" : "=r" (inc), "=m" (*x) : "0" (inc) : "memory"); return inc; } #endif #endif // atomic functions only work on integers /** @brief Add a value to a variable, atomically. * * Implementation is heavily platform-dependent. * @param ptr Pointer to a 32-bit signed integer. * @param addend Value to add. */ inline int32 fetch_and_add_32(volatile int32* ptr, int32 addend) { #if defined(__ICC) //x86 version return _InterlockedExchangeAdd((void*)ptr, addend); #elif defined(__ECC) //IA-64 version return _InterlockedExchangeAdd((void*)ptr, addend); #elif defined(__ICL) || defined(_MSC_VER) return _InterlockedExchangeAdd(reinterpret_cast(ptr), addend); #elif defined(__GNUC__) return __sync_fetch_and_add(ptr, addend); #elif defined(__SUNPRO_CC) && defined(__sparc) volatile int32 before, after; do { before = *ptr; after = before + addend; } while (atomic_cas_32((volatile unsigned int*)ptr, before, after) != before); return before; #else //fallback, slow #pragma message("slow fetch_and_add_32") int32 res; #pragma omp critical { res = *ptr; *(ptr) += addend; } return res; #endif } /** @brief Add a value to a variable, atomically. * * Implementation is heavily platform-dependent. * @param ptr Pointer to a 64-bit signed integer. * @param addend Value to add. */ inline int64 fetch_and_add_64(volatile int64* ptr, int64 addend) { #if defined(__ICC) && defined(__x86_64) //x86 version return faa64((int64*)ptr, addend); #elif defined(__ECC) //IA-64 version return _InterlockedExchangeAdd64((void*)ptr, addend); #elif defined(__ICL) || defined(_MSC_VER) #ifndef _WIN64 _GLIBCXX_PARALLEL_ASSERT(false); //not available in this case return 0; #else return _InterlockedExchangeAdd64(ptr, addend); #endif #elif defined(__GNUC__) && defined(__x86_64) return __sync_fetch_and_add(ptr, addend); #elif defined(__GNUC__) && defined(__i386) && \ (defined(__i686) || defined(__pentium4) || defined(__athlon)) return __sync_fetch_and_add(ptr, addend); #elif defined(__SUNPRO_CC) && defined(__sparc) volatile int64 before, after; do { before = *ptr; after = before + addend; } while (atomic_cas_64((volatile unsigned long long*)ptr, before, after) != before); return before; #else //fallback, slow #if defined(__GNUC__) && defined(__i386) // XXX doesn't work with -march=native //#warning "please compile with -march=i686 or better" #endif #pragma message("slow fetch_and_add_64") int64 res; #pragma omp critical { res = *ptr; *(ptr) += addend; } return res; #endif } /** @brief Add a value to a variable, atomically. * * Implementation is heavily platform-dependent. * @param ptr Pointer to a signed integer. * @param addend Value to add. */ template inline T fetch_and_add(volatile T* ptr, T addend) { if (sizeof(T) == sizeof(int32)) return (T)fetch_and_add_32((volatile int32*) ptr, (int32)addend); else if (sizeof(T) == sizeof(int64)) return (T)fetch_and_add_64((volatile int64*) ptr, (int64)addend); else _GLIBCXX_PARALLEL_ASSERT(false); } #if defined(__ICC) template inline int32 cas32(volatile int32* ptr, int32 old, int32 nw) { int32 before; __asm__ __volatile__("lock; cmpxchgl %1,%2" : "=a"(before) : "q"(nw), "m"(*(volatile long long*)(ptr)), "0"(old) : "memory"); return before; } #if defined(__x86_64) template inline int64 cas64(volatile int64 *ptr, int64 old, int64 nw) { int64 before; __asm__ __volatile__("lock; cmpxchgq %1,%2" : "=a"(before) : "q"(nw), "m"(*(volatile long long*)(ptr)), "0"(old) : "memory"); return before; } #endif #endif /** @brief Compare @c *ptr and @c comparand. If equal, let @c * *ptr=replacement and return @c true, return @c false otherwise. * * Implementation is heavily platform-dependent. * @param ptr Pointer to 32-bit signed integer. * @param comparand Compare value. * @param replacement Replacement value. */ inline bool compare_and_swap_32(volatile int32* ptr, int32 comparand, int32 replacement) { #if defined(__ICC) //x86 version return _InterlockedCompareExchange((void*)ptr, replacement, comparand) == comparand; #elif defined(__ECC) //IA-64 version return _InterlockedCompareExchange((void*)ptr, replacement, comparand) == comparand; #elif defined(__ICL) || defined(_MSC_VER) return _InterlockedCompareExchange(reinterpret_cast(ptr), replacement, comparand) == comparand; #elif defined(__GNUC__) return __sync_bool_compare_and_swap(ptr, comparand, replacement); #elif defined(__SUNPRO_CC) && defined(__sparc) return atomic_cas_32((volatile unsigned int*)ptr, comparand, replacement) == comparand; #else #pragma message("slow compare_and_swap_32") bool res = false; #pragma omp critical { if (*ptr == comparand) { *ptr = replacement; res = true; } } return res; #endif } /** @brief Compare @c *ptr and @c comparand. If equal, let @c * *ptr=replacement and return @c true, return @c false otherwise. * * Implementation is heavily platform-dependent. * @param ptr Pointer to 64-bit signed integer. * @param comparand Compare value. * @param replacement Replacement value. */ inline bool compare_and_swap_64(volatile int64* ptr, int64 comparand, int64 replacement) { #if defined(__ICC) && defined(__x86_64) //x86 version return cas64(ptr, comparand, replacement) == comparand; #elif defined(__ECC) //IA-64 version return _InterlockedCompareExchange64((void*)ptr, replacement, comparand) == comparand; #elif defined(__ICL) || defined(_MSC_VER) #ifndef _WIN64 _GLIBCXX_PARALLEL_ASSERT(false); //not available in this case return 0; #else return _InterlockedCompareExchange64(ptr, replacement, comparand) == comparand; #endif #elif defined(__GNUC__) && defined(__x86_64) return __sync_bool_compare_and_swap(ptr, comparand, replacement); #elif defined(__GNUC__) && defined(__i386) && \ (defined(__i686) || defined(__pentium4) || defined(__athlon)) return __sync_bool_compare_and_swap(ptr, comparand, replacement); #elif defined(__SUNPRO_CC) && defined(__sparc) return atomic_cas_64((volatile unsigned long long*)ptr, comparand, replacement) == comparand; #else #if defined(__GNUC__) && defined(__i386) // XXX -march=native //#warning "please compile with -march=i686 or better" #endif #pragma message("slow compare_and_swap_64") bool res = false; #pragma omp critical { if (*ptr == comparand) { *ptr = replacement; res = true; } } return res; #endif } /** @brief Compare @c *ptr and @c comparand. If equal, let @c * *ptr=replacement and return @c true, return @c false otherwise. * * Implementation is heavily platform-dependent. * @param ptr Pointer to signed integer. * @param comparand Compare value. * @param replacement Replacement value. */ template inline bool compare_and_swap(volatile T* ptr, T comparand, T replacement) { if (sizeof(T) == sizeof(int32)) return compare_and_swap_32((volatile int32*) ptr, (int32)comparand, (int32)replacement); else if (sizeof(T) == sizeof(int64)) return compare_and_swap_64((volatile int64*) ptr, (int64)comparand, (int64)replacement); else _GLIBCXX_PARALLEL_ASSERT(false); } /** @brief Yield the control to another thread, without waiting for the end to the time slice. */ inline void yield() { #if defined (_WIN32) && !defined (__CYGWIN__) Sleep(0); #else sched_yield(); #endif } } // end namespace #endif // -*- C++ -*- // Copyright (C) 2007, 2008 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/workstealing.h * @brief Parallelization of embarrassingly parallel execution by * means of work-stealing. * * Work stealing is described in * * R. D. Blumofe and C. E. Leiserson. * Scheduling multithreaded computations by work stealing. * Journal of the ACM, 46(5):720–748, 1999. * * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Felix Putze. #ifndef _GLIBCXX_PARALLEL_WORKSTEALING_H #define _GLIBCXX_PARALLEL_WORKSTEALING_H 1 #include #include #include namespace __gnu_parallel { #define _GLIBCXX_JOB_VOLATILE volatile /** @brief One job for a certain thread. */ template struct Job { typedef _DifferenceTp difference_type; /** @brief First element. * * Changed by owning and stealing thread. By stealing thread, * always incremented. */ _GLIBCXX_JOB_VOLATILE difference_type first; /** @brief Last element. * * Changed by owning thread only. */ _GLIBCXX_JOB_VOLATILE difference_type last; /** @brief Number of elements, i. e. @c last-first+1. * * Changed by owning thread only. */ _GLIBCXX_JOB_VOLATILE difference_type load; }; /** @brief Work stealing algorithm for random access iterators. * * Uses O(1) additional memory. Synchronization at job lists is * done with atomic operations. * @param begin Begin iterator of element sequence. * @param end End iterator of element sequence. * @param op User-supplied functor (comparator, predicate, adding * functor, ...). * @param f Functor to "process" an element with op (depends on * desired functionality, e. g. for std::for_each(), ...). * @param r Functor to "add" a single result to the already * processed elements (depends on functionality). * @param base Base value for reduction. * @param output Pointer to position where final result is written to * @param bound Maximum number of elements processed (e. g. for * std::count_n()). * @return User-supplied functor (that may contain a part of the result). */ template Op for_each_template_random_access_workstealing(RandomAccessIterator begin, RandomAccessIterator end, Op op, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits :: difference_type bound) { _GLIBCXX_CALL(end - begin) typedef std::iterator_traits traits_type; typedef typename traits_type::difference_type difference_type; const _Settings& __s = _Settings::get(); difference_type chunk_size = static_cast(__s.workstealing_chunk_size); // How many jobs? difference_type length = (bound < 0) ? (end - begin) : bound; // To avoid false sharing in a cache line. const int stride = __s.cache_line_size * 10 / sizeof(Job) + 1; // Total number of threads currently working. thread_index_t busy = 0; Job *job; omp_lock_t output_lock; omp_init_lock(&output_lock); // Write base value to output. output = base; // No more threads than jobs, at least one thread. thread_index_t num_threads = __gnu_parallel::max(1, __gnu_parallel::min(length, get_max_threads())); # pragma omp parallel shared(busy) num_threads(num_threads) { # pragma omp single { num_threads = omp_get_num_threads(); // Create job description array. job = new Job[num_threads * stride]; } // Initialization phase. // Flags for every thread if it is doing productive work. bool iam_working = false; // Thread id. thread_index_t iam = omp_get_thread_num(); // This job. Job& my_job = job[iam * stride]; // Random number (for work stealing). thread_index_t victim; // Local value for reduction. Result result = Result(); // Number of elements to steal in one attempt. difference_type steal; // Every thread has its own random number generator // (modulo num_threads). random_number rand_gen(iam, num_threads); // This thread is currently working. # pragma omp atomic ++busy; iam_working = true; // How many jobs per thread? last thread gets the rest. my_job.first = static_cast(iam * (length / num_threads)); my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1); my_job.load = my_job.last - my_job.first + 1; // Init result with first value (to have a base value for reduction). if (my_job.first <= my_job.last) { // Cannot use volatile variable directly. difference_type my_first = my_job.first; result = f(op, begin + my_first); ++my_job.first; --my_job.load; } RandomAccessIterator current; # pragma omp barrier // Actual work phase // Work on own or stolen start while (busy > 0) { // Work until no productive thread left. # pragma omp flush(busy) // Thread has own work to do while (my_job.first <= my_job.last) { // fetch-and-add call // Reserve current job block (size chunk_size) in my queue. difference_type current_job = fetch_and_add(&(my_job.first), chunk_size); // Update load, to make the three values consistent, // first might have been changed in the meantime my_job.load = my_job.last - my_job.first + 1; for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; ++job_counter) { // Yes: process it! current = begin + current_job; ++current_job; // Do actual work. result = r(result, f(op, current)); } # pragma omp flush(busy) } // After reaching this point, a thread's job list is empty. if (iam_working) { // This thread no longer has work. # pragma omp atomic --busy; iam_working = false; } difference_type supposed_first, supposed_last, supposed_load; do { // Find random nonempty deque (not own), do consistency check. yield(); # pragma omp flush(busy) victim = rand_gen(); supposed_first = job[victim * stride].first; supposed_last = job[victim * stride].last; supposed_load = job[victim * stride].load; } while (busy > 0 && ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last))); if (busy == 0) break; if (supposed_load > 0) { // Has work and work to do. // Number of elements to steal (at least one). steal = (supposed_load < 2) ? 1 : supposed_load / 2; // Push victim's start forward. difference_type stolen_first = fetch_and_add( &(job[victim * stride].first), steal); difference_type stolen_try = stolen_first + steal - difference_type(1); my_job.first = stolen_first; my_job.last = __gnu_parallel::min(stolen_try, supposed_last); my_job.load = my_job.last - my_job.first + 1; // Has potential work again. # pragma omp atomic ++busy; iam_working = true; # pragma omp flush(busy) } # pragma omp flush(busy) } // end while busy > 0 // Add accumulated result to output. omp_set_lock(&output_lock); output = r(output, result); omp_unset_lock(&output_lock); } delete[] job; // Points to last element processed (needed as return value for // some algorithms like transform) f.finish_iterator = begin + length; omp_destroy_lock(&output_lock); return op; } } // end namespace #endif // -*- C++ -*- // Copyright (C) 2007 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/equally_split.h * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Johannes Singler. #ifndef _GLIBCXX_PARALLEL_EQUALLY_SPLIT_H #define _GLIBCXX_PARALLEL_EQUALLY_SPLIT_H 1 namespace __gnu_parallel { /** @brief Function to split a sequence into parts of almost equal size. * * The resulting sequence s of length num_threads+1 contains the splitting * positions when splitting the range [0,n) into parts of almost * equal size (plus minus 1). The first entry is 0, the last one * n. There may result empty parts. * @param n Number of elements * @param num_threads Number of parts * @param s Splitters * @returns End of splitter sequence, i. e. @c s+num_threads+1 */ template OutputIterator equally_split(difference_type n, thread_index_t num_threads, OutputIterator s) { difference_type chunk_length = n / num_threads; difference_type num_longer_chunks = n % num_threads; difference_type pos = 0; for (thread_index_t i = 0; i < num_threads; ++i) { *s++ = pos; pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length; } *s++ = n; return s; } /** @brief Function to split a sequence into parts of almost equal size. * * Returns the position of the splitting point between * thread number thread_no (included) and * thread number thread_no+1 (excluded). * @param n Number of elements * @param num_threads Number of parts * @returns _SplittingAlgorithm point */ template difference_type equally_split_point(difference_type n, thread_index_t num_threads, thread_index_t thread_no) { difference_type chunk_length = n / num_threads; difference_type num_longer_chunks = n % num_threads; if (thread_no < num_longer_chunks) return thread_no * (chunk_length + 1); else return num_longer_chunks * (chunk_length + 1) + (thread_no - num_longer_chunks) * chunk_length; } } #endif // -*- C++ -*- // Copyright (C) 2007, 2008 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the terms // of the GNU General Public License as published by the Free Software // Foundation; either version 2, or (at your option) any later // version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License // along with this library; see the file COPYING. If not, write to // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, // MA 02111-1307, USA. // As a special exception, you may use this file as part of a free // software library without restriction. Specifically, if other files // instantiate templates or use macros or inline functions from this // file, or you compile this file and link it with other files to // produce an executable, this file does not by itself cause the // resulting executable to be covered by the GNU General Public // License. This exception does not however invalidate any other // reasons why the executable file might be covered by the GNU General // Public License. /** @file parallel/partition.h * @brief Parallel implementation of std::partition(), * std::nth_element(), and std::partial_sort(). * This file is a GNU parallel extension to the Standard C++ Library. */ // Written by Johannes Singler and Felix Putze. #ifndef _GLIBCXX_PARALLEL_PARTITION_H #define _GLIBCXX_PARALLEL_PARTITION_H 1 #include #include #include #include #include /** @brief Decide whether to declare certain variables volatile. */ #define _GLIBCXX_VOLATILE volatile namespace __gnu_parallel { /** @brief Parallel implementation of std::partition. * @param begin Begin iterator of input sequence to split. * @param end End iterator of input sequence to split. * @param pred Partition predicate, possibly including some kind of pivot. * @param num_threads Maximum number of threads to use for this task. * @return Number of elements not fulfilling the predicate. */ template typename std::iterator_traits::difference_type parallel_partition(RandomAccessIterator begin, RandomAccessIterator end, Predicate pred, thread_index_t num_threads) { typedef std::iterator_traits traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; difference_type n = end - begin; _GLIBCXX_CALL(n) const _Settings& __s = _Settings::get(); // Shared. _GLIBCXX_VOLATILE difference_type left = 0, right = n - 1; _GLIBCXX_VOLATILE difference_type leftover_left, leftover_right; _GLIBCXX_VOLATILE difference_type leftnew, rightnew; bool* reserved_left = NULL, * reserved_right = NULL; difference_type chunk_size; omp_lock_t result_lock; omp_init_lock(&result_lock); //at least two chunks per thread if(right - left + 1 >= 2 * num_threads * chunk_size) # pragma omp parallel num_threads(num_threads) { # pragma omp single { num_threads = omp_get_num_threads(); reserved_left = new bool[num_threads]; reserved_right = new bool[num_threads]; if (__s.partition_chunk_share > 0.0) chunk_size = std::max(__s.partition_chunk_size, (double)n * __s.partition_chunk_share / (double)num_threads); else chunk_size = __s.partition_chunk_size; } while (right - left + 1 >= 2 * num_threads * chunk_size) { # pragma omp single