CnC
dist_cnc.h
00001 /* *******************************************************************************
00002  *  Copyright (c) 2007-2014, Intel Corporation
00003  *
00004  *  Redistribution and use in source and binary forms, with or without
00005  *  modification, are permitted provided that the following conditions are met:
00006  *
00007  *  * Redistributions of source code must retain the above copyright notice,
00008  *    this list of conditions and the following disclaimer.
00009  *  * Redistributions in binary form must reproduce the above copyright
00010  *    notice, this list of conditions and the following disclaimer in the
00011  *    documentation and/or other materials provided with the distribution.
00012  *  * Neither the name of Intel Corporation nor the names of its contributors
00013  *    may be used to endorse or promote products derived from this software
00014  *    without specific prior written permission.
00015  *
00016  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00017  *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00018  *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00019  *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
00020  *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00021  *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
00022  *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00023  *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
00024  *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00025  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00026  ********************************************************************************/
00027 
00028 // ===============================================================================
00029 // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
00030 //
00031 //    INCLUDE THIS FILE ONLY TO MAKE YOUR PROGRAM READY FOR DISTRIBUTED CnC
00032 //
00033 // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
00034 // ===============================================================================
00035 
00036 #ifndef __DIST_CNC__H_
00037 #define __DIST_CNC__H_
00038 
00039 /**
00040 \page distcnc Running CnC applications on distributed memory
00041 
00042 In principle, every clean CnC program should be immediately
00043 applicable for distributed memory systems. With only a few trivial
00044 changes most CnC programs can be made distribution-ready. You will
00045 get a binary that runs on shared and distributed memory.  Most of
00046 the mechanics of data distribution etc. is handled inside the
00047 runtime and the programmer does not need to bother about the gory
00048 details. Of course, there are a few minor changes needed to make a
00049 program distribution-ready, but once that's done, it will run on
00050 distributed CnC as well as on "normal" CnC (decided at runtime).
00051 
00052 \section dc_comm Inter-process communication
00053 Conceptually, CnC allows data and computation distribution
00054 across any kind of network; currently CnC supports SOCKETS and MPI.
00055 
00056 \section dc_link Linking for distCnC
00057 Support for distributed memory is part of the "normal" CnC
00058 distribution, e.g. it comes with the necessary communication
00059 libraries (cnc_socket, cnc_mpi). The communication library is
00060 loaded on demand at runtime, hence you do not need to link against
00061 extra libraries to create distribution-ready applications. Just
00062 link your binaries like a "traditional" CnC application (explained
00063 in the CnC User Guide, which can be found in the doc directory).
00064 \note a distribution-ready CnC application-binary has no dependencies
00065       on an MPI library, it can be run on shared memory or over SOCKETS
00066       even if no MPI is available on the system
00067 
00068 Even though it is not a separate package or module in the CNC kit,
00069 in the following we will refer to features that are specific for
00070 distributed memory with "distCnC".
00071 
00072 \section dc_prog Making your program distCnC-ready
00073 As a distributed version of a CnC program needs to do things which
00074 are not required in a shared memory version, the extra code for
00075 distCnC is hidden from "normal" CnC headers. To include the
00076 features required for a distributed version you need to
00077 \code #include <cnc/dist_cnc.h> \endcode
00078 instead of \code #include <cnc/cnc.h> \endcode .
00079 If you want to be able to create optimized binaries for shared
00080 memory and distributed memory from the same source, you might
00081 consider protecting distCnC specifics like this:
00082   @code
00083    #ifdef _DIST_
00084    # include <cnc/dist_cnc.h>
00085    #else
00086    # include <cnc/cnc.h>
00087    #endif
00088   @endcode
00089 
00090 In "main", initialize an object CnC::dist_cnc_init< list-of-contexts >
00091 before anything else; parameters should be all context-types that
00092 you would like to be distributed. Context-types not listed in here
00093 will stay local. You may mix local and distributed contexts, but
00094 in most cases only one context is needed/used anyway.
00095   @code
00096    #ifdef _DIST_
00097        CnC::dist_cnc_init< my_context_type_1 //, my_context_type_2, ...
00098                          > _dinit;
00099    #endif
00100   @endcode
00101 
00102 Even though the communication between processes is entirely handled by
00103 the CnC runtime, C++ doesn't allow automatic marshaling/serialization
00104 of arbitrary data-types. Hence, if and only if your items and/or tags
00105 are non-standard data types, the compiler will notify you about the
00106 need for serialization/marshaling capability. If you are using
00107 standard data types only then marshaling will be handled by CnC
00108 automatically.
00109 
00110 Marshaling doesn't involve sending messages or alike, it only
00111 specifies how an object/variable is packed/unpacked into/from a
00112 buffer. Marshaling of structs/classes without pointers or virtual
00113 functions can easily be enabled using
00114 \code CNC_BITWISE_SERIALIZABLE( type); \endcode
00115 others need a "serialize" method or function. The CnC kit comes
00116 with a convenient interface for this which is similar to BOOST
00117 serialization. It is very simple to use and requires only one
00118 function/method for packing and unpacking. See \ref serialization for
00119 more details.
00120 
00121 <b>This is it! Your CnC program will now run on distributed memory!</b>
00122 
00123 \attention Global variables are evil and must not be used within
00124            the execution scope of steps. Read \ref dist_global
00125            about how CnC supports global read-only data.
00126            Apparently, pointers are nothing else than global
00127            variables and hence need special treatment in distCnC
00128            (see \ref serialization).
00129 \note Even if your program runs on distributed memory, that does not
00130       necessarily imply that the trivial extension above will make it
00131       run fast.  Please consult \ref dist_tuning for the tuning
00132       options for distributed memory.
00133 
00134 The above describes the default "single-program" approach for
00135 distribution.  Please refer to CnC::dist_cnc_init for more advanced
00136 modes which allow SPMD-style interaction as well as distributing 
00137 parts of the CnC program over groups of processes.
00138 
00139 
00140 \section dc_run Running distCnC
00141 The communication infrastructure used by distCnC is chosen at
00142 runtime.  By default, the CnC runtime will run your application in
00143 shared memory mode.  When starting up, the runtime will evaluate
00144 the environment variable "DIST_CNC".  Currently it accepts the
00145 following values
00146 - SHMEM : shared memory (default)
00147 - SOCKETS : communication through TCP sockets
00148 - MPI : using Intel(R) MPI
00149 
00150 Please see \ref itac on how to profile distributed programs
00151 
00152 \subsection dc_sockets Using SOCKETS
00153 On application start-up, when DIST_CNC=SOCKETS, CnC checks the
00154 environment variable "CNC_SOCKET_HOST".  If it is set to a number,
00155 it will print a contact string and wait for the given number of
00156 clients to connect. Usually this means that clients need to be
00157 started "manually" as follows: set DIST_CNC=SOCKETS and
00158 "CNC_SOCKET_CLIENT" to the given contact string and launch the
00159 same executable on the desired machine.
00160 
00161 You can also manually provide the hostname and port number by
00162 setting the environment variables CNC_SOCKET_HOST and 
00163 CNC_SOCKET_HOSTNAME.
00164 
00165 If "CNC_SOCKET_HOST" is not a number it is interpreted as a
00166 name of a script. CnC executes the script twice: First with "-n"
00167 it expects the script to return the number of clients it will
00168 start. The second invocation is expected to launch the client
00169 processes. If the returned number of clients is prepended with
00170 '+' CnC will assume that the script starts all client processes
00171 in one go when called the second time (without -n). If it's
00172 a plain positive integer CnC will run the script once for each
00173 client process individually.
00174 
00175 CnC also sets 2 environment variables that the script can read:
00176 - CNC_HOST_EXECUTABLE : the name of the executable that's run
00177                         by the host process
00178 - CNC_HOST_ARGS : the command-line arguments passed to the host
00179                   process
00180 
00181 Three example scripts are provided:
00182 - misc/distributed/socket/start.sh : starts each client individually
00183 - misc/distributed/socket/start_batch.sh : starts all clients in one go
00184 - misc/distributed/socket/start_mpirun.sh: uses mpirun to start all clients together
00185 
00186 All scripts require password-less ssh login (or whatever MPI is
00187 configured to use if you use the mpirun script). Set the env var
00188 CNC_NUM_CLIENTS to the number of clients you want to start.  To
00189 facilitate the use of different machines they also read the env var
00190 CNC_HOST_FILE. If found, they will read the hostnames to use from the
00191 given file (expect one hostname per line). If CNC_HOST_FILE is not
00192 specified all clients get started on localhost.
00193 
00194 For windows, the script "start.bat" does the same as start.sh,
00195 except that it will start the clients on the same machine without
00196 ssh or alike. Adjust the script to use your preferred remote login
00197 mechanism.
00198 
00199 \subsection dc_mpi MPI
00200 CnC comes with a communication layer based on MPI. You need the
00201 Intel(R) MPI runtime to use it. You can download a free version of
00202 the MPI runtime from
00203 http://software.intel.com/en-us/articles/intel-mpi-library/ (under
00204 "Resources").  A distCnC application is launched like any other
00205 MPI application with mpirun or mpiexec, but DIST_CNC must be set
00206 to MPI:
00207 \code
00208 env DIST_CNC=MPI mpiexec -n 4 my_cnc_program
00209 \endcode
00210 Alternatively, just run the app as usual (with DIST_CNC=MPI) and
00211 control the number (n) of additionally spawned processes with
00212 CNC_MPI_SPAWN=n.  If host and client applications need to be
00213 different, set CNC_MPI_EXECUTABLE to the client-program
00214 name. Here's an example:
00215 \code
00216 env DIST_CNC=MPI CNC_MPI_SPAWN=3 CNC_MPI_EXECUTABLE=cnc_client cnc_host
00217 \endcode
00218 It starts your host executable "cnc_host" and then spawns 3 additional 
00219 processes which all execute the client executable "cnc_client".
00220 
00221 \subsection dc_mic Intel Xeon Phi(TM) (MIC)
00222 For CnC a MIC process is just another process where work can be computed
00223 on. So all you need to do is
00224 - Build your application for MIC (see
00225   http://software.intel.com/en-us/articles/intel-concurrent-collections-getting-started)
00226 - Start a process with the MIC executable on each MIC card just
00227   like on a CPU. Communication and Startup is equivalent to how it
00228   works on intel64 (\ref dc_mpi and \ref dc_sockets).
00229 
00230 \note Of course the normal mechanics for MIC need to be considered
00231       (like getting applications and dependent libraries to the MIC
00232       first).  You'll find documentation about this on IDZ, like 
00233       <A HREF="http://software.intel.com/en-us/articles/how-to-run-intel-mpi-on-xeon-phi">here</A>
00234       and/or <A HREF="http://software.intel.com/en-us/articles/using-the-intel-mpi-library-on-intel-xeon-phi-coprocessor-systems">here</A>
00235 \note We recommend starting only 2 threads per MIC-core, e.g. if your
00236       card has 60 cores, set CNC_NUM_THREADS=120
00237 \note To start different binaries with one mpirun/mpiexec command you
00238       can use a syntax like this:<br>
00239       mpirun -genv DIST_CNC=MPI -n 2 -host xeon xeonbinary : -n 1 -host mic0 -env CNC_NUM_THREADS=120 micbinary
00240 
00241 
00242 \section def_dist Default Distribution
00243 Step instances are distributed across clients and the host. By
00244 default, they are distributed in a round-robin fashion. Note that
00245 every process can put tags (and so prescribe new step instances).
00246 The round-robin distribution decision is made locally on each
00247 process (not globally).
00248 
00249 If the same tag is put multiple times, the default scheduling
00250 might execute the multiply prescribed steps on different processes
00251 and the preserveTags attribute of tag_collections will then not
00252 have the desired effect.
00253 
00254 The default scheduling is intended primarily as a development aid:
00255 your CnC application will be distribution-ready with only little effort.
00256 In some cases it might lead to good performance, in other cases
00257 a sensible distribution is needed to achieve good performance.
00258 See \ref dist_tuning.
00259 
00260 Next: \ref dist_tuning
00261 
00262 
00263 \page dist_tuning Tuning for distributed memory
00264 The CnC tuning interface provides convenient ways to control the 
00265 distribution of work and data across the address spaces. The
00266 tuning interface is separate from the actual step-code and its
00267 declarative nature allows flexible and productive experiments with
00268 different distribution strategies.
00269 
00270 \section dist_work Distributing the work
00271 Let's first look at the distribution of work/steps.  You can specify
00272 the distribution of work (e.g. steps) across the network by providing
00273 a tuner to a step-collection (the second template argument to
00274 CnC::step_collection, see \ref tuning). Similar to other tuning
00275 features, the tuner defines the distribution plan based on the
00276 control-tags and item-tags. For a given instance (identified by the
00277 control-tag) the tuner defines the placement of the instance in the
00278 communication network. This mechanism allows a declarative definition
00279 of the distribution and keeps it separate from the actual program code
00280 - you can change the distribution without changing the actual program.
00281 
00282 The method for distributing steps is called "compute_on". It takes the
00283 tag of the step and the context as arguments and has to return the
00284 process number to run the step on. The numbering of processes is
00285 similar to ranks in MPI. Running on "N" processes, the host process is
00286 "0" and the last client "N-1".
00287 
00288   @code
00289    struct my_tuner : public CnC::step_tuner<>
00290    {
00291        int compute_on( const tag_type & tag, context_type & ) const { return tag % numProcs(); }
00292    };
00293   @endcode
00294 
00295 The shown tuner is derived from CnC::step_tuner. To allow a flexible
00296 and generic definition of the distribution CnC::step_tuner provides
00297 information specific for distributed memory:
00298 CnC::tuner_base::numProcs() and CnC::tuner_base::myPid(). Both return
00299 the values of the current run of your application.  Using those allows
00300 defining a distribution plan which adapts to the current runtime
00301 configuration.
00302 
00303 If you wonder how the necessary data gets distributed - this will be
00304 covered soon. Let's first look at the computation side a bit more
00305 closely; but if you can't wait see \ref dist_data.
00306 
00307 The given tuner above simply distributes the tags in a
00308 round-robin fashion by applying the modulo operator on the tag. Here's
00309 an example of how a given set of tags would be mapped to 4 processes
00310 (e.g. numProcs()==4):
00311 \verbatim
00312 1  -> 1
00313 3  -> 3
00314 4  -> 0
00315 5  -> 1
00316 10 -> 2
00317 20 -> 0
00318 31 -> 3
00319 34 -> 2
00320 \endverbatim
00321 
00322 An example of such a simple tuner is \ref bs_tuner.
00323 
00324 Now let's do something a little more interesting. Let's assume our tag
00325 is a pair of x and y coordinates. To distribute the work per row, we
00326 could simply do something like
00327 
00328   @code
00329    struct my_tuner : public CnC::step_tuner<>
00330    {
00331        int compute_on( const tag_type & tag, context_type & ) const { return tag.y % numProcs(); }
00332    };
00333   @endcode
00334 
00335 As you see, the tuner entirely ignores the x-part of the tag. This
00336 means that all entries on a given row (identified by tag.y) get
00337 executed on the same process.  Similarly, if you want to distribute
00338 the work per column instead, you simply change it to
00339 
00340   @code
00341    struct my_tuner : public CnC::step_tuner<>
00342    {
00343        int compute_on( const tag_type & tag, context_type & ) const { return tag.x % numProcs(); }
00344    };
00345   @endcode
00346 
00347 As we'll also see later, you can certainly also conditionally switch
00348 between row- and column-wise (or any other) distribution within
00349 compute_on.
00350 
00351 To avoid the afore-mentioned problem of becoming globally
00352 inconsistent, you should make sure that the return value is
00353 independent of the process it is executed on.
00354 
00355 CnC provides special values to make working with compute_on more
00356 convenient, more generic and more effective:
00357 CnC::COMPUTE_ON_LOCAL, CnC::COMPUTE_ON_ROUND_ROBIN,
00358 CnC::COMPUTE_ON_ALL, CnC::COMPUTE_ON_ALL_OTHERS.
00359 
00360 \section dist_data Distributing the data
00361 By default, the CnC runtime will deliver data items automatically
00362 to where they are needed. In its current form, the C++ API does
00363 not express the dependencies between instances of steps and/or
00364 items. Hence, without additional information, the runtime does not
00365 know what step-instances produce and consume which
00366 item-instances. Even when the step-distribution is known,
00367 automatic distribution of data requires
00368 global communication. Apparently this constitutes a considerable
00369 bottleneck. The CnC tuner interface provides two ways to reduce
00370 this overhead.
00371 
00372 The ideal, most flexible and most efficient approach is to map
00373 items to their consumers.  It will convert the default pull-model
00374 to a push-model: whenever an item becomes produced, it will be
00375 sent only to those processes, which actually need it without any
00376 other communication/synchronization. If you can determine which
00377 steps are going to consume a given item, you can use the above
00378 compute_on to map the consumer step to the actual address
00379 spaces. This allows changing the distribution at a single place
00380 (compute_on) and the data distribution will be automatically
00381 optimized to the minimum needed data transfer.
00382 
00383 The runtime evaluates the tuner provided to the item-collection
00384 when an item is put.  If its method consumed_on (from
00385 CnC::item_tuner) returns anything other than CnC::CONSUMER_UNKNOWN
00386 it will send the item to the returned process id and avoid all the
00387 overhead of requesting the item when consumed.
00388   @code
00389    struct my_tuner : public CnC::item_tuner< tag_type, item_type >
00390    {
00391        int consumed_on( const tag_type & tag ) 
00392        {
00393            return my_step_tuner::consumed_on( consumer_step );
00394        }
00395    };
00396   @endcode
00397 
00398 As more than one process might consume the item, you
00399 can also return a vector of ids (instead of a single id) and the
00400 runtime will send the item to all given processes.
00401   @code
00402    struct my_tuner : public CnC::item_tuner< tag_type, item_type >
00403    {
00404        std::vector< int > consumed_on( const tag_type & tag ) 
00405        {
00406            std::vector< int > consumers;
00407            foreach( consumer_step of tag ) {
00408                int _tmp = my_step_tuner::consumed_on( consumer_step );
00409                consumers.push_back( _tmp );
00410            }
00411            return consumers;
00412        }
00413    };
00414   @endcode
00415 
00416 Like for compute_on, CnC provides special values to facilitate and
00417 generalize the use of consumed_on: CnC::CONSUMER_UNKNOWN,
00418 CnC::CONSUMER_LOCAL, CnC::CONSUMER_ALL and
00419 CnC::CONSUMER_ALL_OTHERS.
00420 
00421 Note that consumed_on can return CnC::CONSUMER_UNKNOWN for some
00422 item-instances, and process rank(s) for others.
00423 
00424 Sometimes the program semantics make it easier to think about the
00425 producer of an item. CnC provides a mechanism to keep the
00426 pull-model but allows declaring the owner/producer of the item. If
00427 the producer of an item is specified the CnC-runtime can
00428 significantly reduce the communication overhead because it no
00429 longer requires global communication to find the owner of the
00430 item. For this, simply define the depends-method in your
00431 step-tuner (derived from CnC::step_tuner) and provide the
00432 owning/producing process as an additional argument.
00433 
00434   @code
00435    struct my_tuner : public CnC::step_tuner<>
00436    {
00437        int produced_on( const tag_type & tag ) const
00438        {
00439            return producer_known ? my_step_tuner::consumed_on( tag ) : tag % numProcs();
00440        }
00441    };
00442   @endcode
00443 
00444 Like for consumed_on, CnC provides special values
00445 CnC::PRODUCER_UNKNOWN and CnC::PRODUCER_LOCAL to facilitate and
00446 generalize the use of produced_on.
00447 
00448 The push-model consumed_on smoothly cooperates with the
00449 pull-model as long as they don't conflict.
00450 
00451 \section dist_sync Keeping data and work distribution in sync
00452 For a more productive development, you might consider implementing
00453 consumed_on by thinking about which other steps (not processes)
00454 consume the item. With that knowledge you can easily use the
00455 appropriate compute_on function to determine the consuming process.
00456 The great benefit here is that you can then change compute
00457 distribution (e.g. change compute_on) and the data will automatically
00458 follow in an optimal way; data and work distribution will always be in
00459 sync.  It allows experimenting with different distribution plans with
00460 much less trouble and lets you define different strategies at a single
00461 place.  Here is a simple example code which lets you select different
00462 strategies at runtime. Adding a new strategy only requires extending
00463 the compute_on function:
00464 \ref bs_tuner
00465 A more complex example is this one: \ref cholesky_tuner
00466 
00467 \section dist_global Using global read-only data with distCnC
00468 Many algorithms require global data that is initialized once and
00469 during computation it stays read-only (dynamic single assignment,
00470 DSA).  In principle this is aligned with the CnC methodology as
00471 long as the initialization is done from the environment.  The CnC
00472 API allows global DSA data through the context, e.g. you can store
00473 global data in the context, initialize it there and then use it in
00474 a read-only fashion within your step codes.
00475 
00476 The internal mechanism works as follows: on remote processes the
00477 user context is default constructed and then
00478 de-serialized/un-marshaled. On the host, construction and
00479 serialization/marshaling is done in a lazy manner, e.g. not
00480 before something actually needs being transferred.  This allows
00481 creating contexts on the host with non-default constructors, but
00482 it requires overloading the serialize method of the context.  The
00483 actual time of transfer is not statically known, the earliest
00484 possible time is the first item- or tag-put. All changes to the
00485 context until that point will be duplicated remotely, later
00486 changes will not.
00487 
00488 Here is a simple example code which uses this feature:
00489 \ref bs_tuner
00490 
00491 Next: \ref non_cnc
00492 **/
00493 
00494 #ifdef _CnC_H_ALREADY_INCLUDED_
00495 #warning dist_cnc.h included after cnc.h. Distribution capabilities will not be activated.
00496 #endif
00497 
00498 #ifndef _DIST_CNC_
00499 # define _DIST_CNC_
00500 #endif
00501 
00502 #include <cnc/internal/dist/dist_init.h>
00503 
00504 namespace CnC {
00505     namespace Internal {
00506         class void_context;
00507     }
00508 
00509     /// To enable remote CnC you must create one such object.  The
00510     /// lifetime of the object defines the "scope" of
00511     /// distribution. Contexts created in the "scope" of the
00512     /// dist_cnc_init objects (e.g. when it exists) will get
00513     /// distributed to participating processes (see \ref dc_run).
00514     ///
00515     /// Usually, a single dist_cnc_init object is created for the
00516     /// entire lifetime of a program.  e.g. the dist_cnc_init object
00517     /// is created right when entering main and (auto-)destructed when
00518     /// main terminates. In this default mode all processes other than
00519     /// the root/host process exit the program when the dist_cnc_init
00520     /// object gets destructed.
00521     ///
00522     /// Actually, the current implementation allows only a single
00523     /// dist_cnc_init object at a time for every process.  Hence, all
00524     /// contexts on a given process are distributed in the same way.
00525     /// However, an optional parameter/flag allows defining the
00526     /// processes that actually "share" the dist_cnc_init object (and
00527     /// so their contexts). An optional flag/parameter is interpreted
00528     /// as a MPI_Comm to be used by the dist_cnc_init scope.  This
00529     /// allows different groups of processes (defined by the
00530     /// MPI_Comm's) to work on different CnC contexts/graphs
00531     /// concurrently. If no MPI_Comm was specified (e.g. the default)
00532     /// client processes exit the program when the host dist_cnc_init
00533     /// object is destructed. If a MPI_Comm is provided they also wait
00534     /// until the host process destructs its dist_cnc_init object but
00535     /// simply returns from the constructor rather than exiting the
00536     /// program.  Apparently all this only works when using the MPI
00537     /// communication infrastructure.
00538     ///
00539     /// Additionally, two modes of operation are supported:
00540     /// 1. By default, constructing a dist_cnc_init objects blocks all
00541     ///    processes except the root process in the constructor.
00542     ///    Hence, code after the object instantiation will be executed
00543     ///    only on the host process.
00544     /// 2. If dist_env is set to true, the constructor returns on all
00545     ///    processes and execution continues in a SPMD style, e.g. all
00546     ///    processes continue program execution. The SPMD style mode
00547     ///    allows alternating between MPI phases and CnC phases. This
00548     ///    mode is currently supported only using MPI communication.
00549     ///    You have to ensure that all processes fully completed their
00550     ///    local context creation before putting any data into a
00551     ///    context's collection. Similarly, you have to synchronize
00552     ///    context-destruction. It is recommended to put a MPI_Barrier
00553     ///    right after instantiating a context and just before it gets
00554     ///    destructed (e.g. at the end of its scope).
00555     ///
00556     /// \note It is possible to combine SPMD mode and providing a 
00557     ///       MPI_Comm. You can even change the grouping in phases by
00558     ///       using different MPI_Comm's at different times of the
00559     ///       execution. E.g. the lifetime of a dist_cnc_object might
00560     ///       be a (collective) function call. Make sure each process
00561     ///       has only a single dist_cnc_init object alive at each point in
00562     ///       time.
00563     ///
00564     /// \note All context classes ever used in the program must be
00565     ///       referenced as template arguments if they should be
00566     ///       distributed.
00567     /// \note All distributed contexts must have all
00568     ///       collections they use as members and must be
00569     ///       default-constructible.
00570     /// \note Pointers as tags are not supported by distCnC.
00571     ///
00572     /// Execution and other internal details described in
00573     /// CnC::Internal::dist_init
00574     template< class C1, class C2 = Internal::void_context, class C3 = Internal::void_context,
00575               class C4 = Internal::void_context, class C5 = Internal::void_context >
00576     struct /*CNC_API*/ dist_cnc_init : public Internal::dist_init< C1, C2, C3, C4, C5 >
00577     {
00578         /// Shorthand for the implementation base holding the distribution machinery.
00579         typedef Internal::dist_init< C1, C2, C3, C4, C5 > base_type;
00580         /// Default mode: client processes block in the constructor, so code
00581         /// after the instantiation executes on the host process only.
00582         dist_cnc_init() : base_type() {}
00583         /// \param dist_env enable SPMD-style access to contexts
00584         /// \param flag MPI_Comm to be used (MPI only)
00585         dist_cnc_init( bool dist_env, long flag = 0 ) : base_type( flag, dist_env ) {}
00586         /// \param dist_env enable SPMD-style access to contexts
00587         /// \param flag MPI_Comm to be used (MPI only)
00588         dist_cnc_init( long flag, bool dist_env = false ) : base_type( flag, dist_env ) {}
00589     };
00586 
00587 } // namespace CnC
00588 
00589 #include <cnc/cnc.h>
00590 
00591 #endif // __DIST_CNC__H_