-
Notifications
You must be signed in to change notification settings - Fork 454
/
Copy pathtest_jenkins.sh
executable file
·1272 lines (1061 loc) · 32 KB
/
test_jenkins.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/bin/bash -eEx
#
# Testing script for OpenUCX, to run from Jenkins CI
#
# Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2023. ALL RIGHTS RESERVED.
# Copyright (C) ARM Ltd. 2016-2018. ALL RIGHTS RESERVED.
#
# See file LICENSE for terms.
#
#
# Environment variables set by Jenkins CI:
# - WORKSPACE : path to work dir
# - BUILD_NUMBER : azure build number
# - JOB_URL : azure job url
# - EXECUTOR_NUMBER : number of executor within the test machine
# - RUN_TESTS : whether to run unit tests
# - TEST_PERF : whether to validate performance
# - ASAN_CHECK : set to enable Address Sanitizer instrumentation build
# - VALGRIND_CHECK : set to enable running tests with Valgrind
#
# Optional environment variables (could be set by job configuration):
# - nworkers : number of parallel executors
# - worker : number of current parallel executor
#
# Load the shared CI helper libraries that live next to this script. The
# script directory is quoted so that a checkout path containing whitespace
# still resolves to a single file name.
source "$(dirname "$0")/../buildlib/az-helpers.sh"
source "$(dirname "$0")/../buildlib/tools/common.sh"

# Fall back to the current directory when WORKSPACE is unset or empty
WORKSPACE=${WORKSPACE:=$PWD}
ucx_inst=${WORKSPACE}/install

if [ -z "$BUILD_NUMBER" ]; then
    # No build number in the environment: treat this as a manual run and
    # enable the tests without any overall timeout
    echo "Running interactive"
    BUILD_NUMBER=1
    WS_URL=file://$WORKSPACE
    RUN_TESTS=yes
    TEST_PERF=1
    TIMEOUT=""
else
    echo "Running under azure"
    WS_URL=$JOB_URL/ws
    # Valgrind runs get a longer time budget
    if [[ "$VALGRIND_CHECK" == "yes" ]]; then
        TIMEOUT="timeout 300m"
    else
        TIMEOUT="timeout 200m"
    fi
fi

# Both probes are best-effort ("|| true"): an empty value means that ptrace
# was not found in the capability bounding set / that strace did not run
have_ptrace=$(capsh --print | grep 'Bounding' | grep ptrace || true)
have_strace=$(strace -V || true)

#
# Override maven repository path, to cache the downloaded packages across tests
#
export maven_repo=${WORKSPACE}/.deps

#
# Set up parallel test execution - "worker" and "nworkers" should be set by jenkins
#
if [ -z "$worker" ] || [ -z "$nworkers" ]
then
    # Not running in parallel: a single worker does everything
    worker=0
    nworkers=1
fi
echo "==== Running on $(hostname), worker $worker / $nworkers ===="
# Report a warning message: through azure_log_warning when RUNNING_IN_AZURE is
# "yes", otherwise by printing it to stdout.
#   $1 - message text
log_warning() {
    msg=$1
    if test "x$RUNNING_IN_AZURE" = "xyes"
    then
        # A failing Azure logger is not treated as an error
        azure_log_warning "${msg}" || true
        # Tracing is switched on after the Azure call (presumably the
        # helper turns it off - confirm in az-helpers.sh)
        set -x
    else
        echo "${msg}"
    fi
}
# Report an error message: through azure_log_error when RUNNING_IN_AZURE is
# "yes", otherwise by printing it to stdout.
#   $1 - message text
log_error() {
    msg=$1
    if test "x$RUNNING_IN_AZURE" = "xyes"
    then
        # A failing Azure logger is not treated as an error
        azure_log_error "${msg}" || true
        # Tracing is switched on after the Azure call (presumably the
        # helper turns it off - confirm in az-helpers.sh)
        set -x
    else
        echo "${msg}"
    fi
}
#
# Check whether this test should do a task with given index,
# according to the parallel test execution parameters.
#
# Arguments:
#   $1 - task index; when omitted, the caller's global "task" is used, which
#        is what this function always read before
#   $2 - total number of tasks (accepted for the callers, not used)
# Globals:
#   worker, nworkers (read)
# Returns:
#   0 when (task index % nworkers) equals worker, non-zero otherwise
#
should_do_task() {
    set +x
    local task_index=${1:-${task:-0}}
    local rc=0
    # "|| rc=$?" records the result without tripping errexit when the
    # function is called outside of a condition
    [[ $((task_index % nworkers)) -eq ${worker} ]] || rc=$?
    set -x
    return $rc
}
#
# Do a given task only if the current worker is supposed to do it.
#
# Arguments:
#   $1   - index of the task
#   $2   - total number of tasks
#   $3.. - command to run, with its arguments
#
do_distributed_task() {
    set +x
    # task/ntasks stay global: should_do_task may read the global "task"
    task=$1
    ntasks=$2
    shift 2
    if should_do_task $task $ntasks
    then
        echo "==== Running '$*' (task $task/$ntasks) ===="
        set -x
        # Quoted so every argument reaches the command exactly as passed,
        # without another round of word splitting and globbing
        "$@"
    else
        echo "==== Skipping '$*' (task $task/$ntasks) ===="
        set -x
    fi
}
#
# Take a list of tasks, and return only the ones this worker should do
#
# Arguments: the task names, as separate words
# Outputs:   the selected task names on one line, separated by spaces
#
get_my_tasks() {
    set +x
    task_list=$@
    # Re-split the flattened list so that counting and iterating see the
    # same words
    set -- $task_list
    ntasks=$#
    my_task_list=""
    task=0
    for item
    do
        if should_do_task $task $ntasks
        then
            my_task_list+=" $item"
        fi
        task=$((task + 1))
    done
    # unquoted on purpose: drops the leading separator
    echo $my_task_list
    set -x
}
#
# Expands a CPU list such as "0-3,17" to "0 1 2 3 17" (each cpu in a new line)
#
#   $1 - comma-separated list of cpu numbers and "first-last" ranges
#
expand_cpulist() {
    cpulist=$1
    # commas separate the entries of the list
    tokens=${cpulist//,/ }
    for token in ${tokens}
    do
        # first and second '-'-separated field; a token without '-' is a
        # single cpu, so both ends are the token itself
        first=${token%%-*}
        last=${token#*-}
        last=${last%%-*}
        for ((cpu=${first};cpu<=${last};++cpu))
        do
            echo ${cpu}
        done
    done
}
#
# Get the N'th CPU that the current process can run on
#
#   $1 - zero-based position in the expanded affinity list; when the list is
#        shorter than that, its last entry is printed
#
slice_affinity() {
    set +x
    n=$1
    # The affinity list is queried through a child bash that asks taskset
    # about its own pid; the list itself follows the colon in the output
    compact_cpulist=$(bash -c 'taskset -cp $$' | cut -d: -f2)
    cpulist=$(expand_cpulist ${compact_cpulist})
    # keep the last line seen among the first n+1 lines
    echo "${cpulist}" | awk -v wanted=$((n + 1)) 'NR <= wanted { cpu = $0 } END { print cpu }'
    set -x
}
#
# Run one instance of a test executable with the "-l" option, pinned to the
# first cpu reported by slice_affinity.
#   $1 - test executable
#   $2 - arguments appended after "-l"
# The exit status of the executable is ignored.
#
run_loopback_app() {
    test_exe=$1
    test_args="-l $2"
    affinity=$(slice_affinity 0)

    taskset -c $affinity ${test_exe} ${test_args} &
    pid=$!

    # a failing test process does not abort the script here
    if ! wait ${pid}
    then
        true
    fi
}
#
# Start a server and a client instance of the same executable, each pinned to
# its own cpu, and wait for both of them.
#
# Arguments:
#   $1 - test executable
#   $2 - arguments passed to both the server and the client
#   $3 - extra argument(s) passed to the client only (callers put the server
#        address here)
#   $4 - 1 to kill the server with SIGKILL after the client has finished
#   $5 - 1 to let a failing client pass (used by the error-emulation runs)
# Globals:
#   server_port - read to build the "-p" option; step_server_port is called
#                 right after (defined outside this block - presumably it
#                 advances server_port for the next caller, confirm there)
#
run_client_server_app() {
test_exe=$1
test_args=$2
server_addr_arg=$3
kill_server=$4
error_emulation=$5
server_port_arg="-p $server_port"
step_server_port
# server and client are pinned to the first and the second available cpu
affinity_server=$(slice_affinity 0)
affinity_client=$(slice_affinity 1)
taskset -c $affinity_server ${test_exe} ${test_args} ${server_port_arg} &
server_pid=$!
# fixed delay before the client is started (presumably to let the server
# come up - there is no explicit readiness check)
sleep 15
if [ $error_emulation -eq 1 ]
then
# a client failure is expected here: do not let errexit / the inherited
# ERR trap abort the script while waiting for it
set +Ee
fi
taskset -c $affinity_client ${test_exe} ${test_args} ${server_addr_arg} ${server_port_arg} &
client_pid=$!
wait ${client_pid}
if [ $error_emulation -eq 1 ]
then
# back to strict error handling
set -eE
fi
if [ $kill_server -eq 1 ]
then
kill -9 ${server_pid} || true # ignore failure
fi
# the exit status of the server is ignored
wait ${server_pid} || true
}
#
# Build (if needed) and run one of the <api>_hello_world examples as a
# client/server pair.
#   $1   - api prefix of the example name (callers pass "ucp" or "uct")
#   $2.. - arguments for the example; a "-e" anywhere in them selects an
#          error-emulation run
#
run_hello() {
    api=$1
    shift
    test_args="$@"
    test_name=${api}_hello_world

    if [ ! -x ${test_name} ]
    then
        $MAKEP -C examples ${test_name}
    fi

    if [[ ${test_args} == *"-e"* ]]
    then
        error_emulation=1
        # set smaller timeouts so the test will complete faster
        export UCX_UD_TIMEOUT=15s UCX_RC_TIMEOUT=1ms UCX_RC_RETRY_COUNT=4
    else
        error_emulation=0
    fi

    run_client_server_app "./examples/${test_name}" "${test_args}" "-n $(hostname)" 0 ${error_emulation}

    # drop the timeouts again if they were set above
    if [[ ${test_args} == *"-e"* ]]
    then
        unset UCX_UD_TIMEOUT UCX_RC_TIMEOUT UCX_RC_RETRY_COUNT
    fi
    unset UCX_PROTO_ENABLE
}
#
# Compile and run UCP hello world example
#
# Runs every test mode for every transport selection and memory type; cuda
# memory types are added only when have_cuda is "yes".
#
run_ucp_hello() {
    if ./src/tools/info/ucx_info -e -u twe|grep ERROR
    then
        return # skip if cannot create ucp ep
    fi

    mem_types_list="host "
    if [[ "$have_cuda" == "yes" ]]
    then
        mem_types_list="${mem_types_list}cuda cuda-managed "
    fi

    # settings shared by all runs below, removed again at the end
    export UCX_KEEPALIVE_INTERVAL=1s UCX_KEEPALIVE_NUM_EPS=10
    export UCX_LOG_LEVEL=info UCX_MM_ERROR_HANDLING=y

    for tls in all tcp,cuda shm,cuda
    do
        export UCX_TLS=${tls}
        for test_mode in -w -f -b -erecv -esend -ekeepalive
        do
            for mem_type in $mem_types_list
            do
                echo "==== Running UCP hello world with mode ${test_mode} and \"${mem_type}\" memory type ===="
                run_hello ucp ${test_mode} -m ${mem_type}
            done
        done
    done

    rm -f ./ucp_hello_world
    unset UCX_KEEPALIVE_INTERVAL UCX_KEEPALIVE_NUM_EPS UCX_LOG_LEVEL UCX_TLS UCX_MM_ERROR_HANDLING
}
#
# Compile and run UCT hello world example
#
# Runs every send function over rc_verbs on each active IB device (for every
# memory type) and over tcp on each active IP interface.
#
run_uct_hello() {
    mem_types_list="host "
    # Both cuda memory types are added only with cuda and the nv_mem peer
    # memory module present; "cuda" is the RDMA GPUDirect case.
    # NOTE(review): cuda-managed is gated on nv_mem as well - confirm intended
    if [[ "$have_cuda" == "yes" && -f "/sys/kernel/mm/memory_peers/nv_mem/version" ]]
    then
        mem_types_list="${mem_types_list}cuda-managed cuda "
    fi

    for send_func in -i -b -z
    do
        for ucx_dev in $(get_active_ib_devices)
        do
            for mem_type in $mem_types_list
            do
                echo "==== Running UCT hello world server on rc/${ucx_dev} with sending ${send_func} and \"${mem_type}\" memory type ===="
                run_hello uct -d ${ucx_dev} -t "rc_verbs" ${send_func} -m ${mem_type}
            done
        done

        for ucx_dev in $(get_active_ip_ifaces)
        do
            echo "==== Running UCT hello world server on tcp/${ucx_dev} with sending ${send_func} ===="
            run_hello uct -d ${ucx_dev} -t "tcp" ${send_func}
        done
    done

    rm -f ./uct_hello_world
}
#
# Run the ucp_client_server example against one server address, for every
# combination of memory type, API and message size.
#   $1 - server ip address; when empty, nothing is run
# The example is built first if ./ucp_client_server is not executable.
#
run_client_server() {
    test_name=ucp_client_server
    mem_types_list="host"
    msg_size_list="1 16 256 4096 65534"
    api_list="am tag stream"

    if [ "X$have_cuda" == "Xyes" ]
    then
        mem_types_list+=" cuda cuda-managed "
    fi

    if [ ! -x ${test_name} ]
    then
        $MAKEP -C examples ${test_name}
    fi

    server_ip=$1
    if [ -z "$server_ip" ]
    then
        return
    fi

    for mem_type in ${mem_types_list}
    do
        for api in ${api_list}
        do
            for msg_size in ${msg_size_list}
            do
                # ${msg_size}, not {$msg_size}: the latter printed literal braces
                echo "==== Running UCP client-server with \"${mem_type}\" memory type using \"${api}\" API with msg_size=${msg_size} ===="
                # kill_server=1: the server is killed once the client is done
                run_client_server_app "./examples/${test_name}" "-m ${mem_type} -c ${api} -s ${msg_size}" "-a ${server_ip}" 1 0
            done
        done
    done
}
#
# Run the UCP client-server example over the RDMA device address, the
# non-RDMA address and the loopback address, in that order.
#
run_ucp_client_server() {
    local addr_source

    echo "==== Running UCP client-server ===="
    for addr_source in get_rdma_device_ip_addr get_non_rdma_ip_addr
    do
        # unquoted on purpose: an empty answer must result in no argument
        run_client_server $(${addr_source})
    done
    run_client_server "127.0.0.1"
}
#
# Run the io_demo application as a client/server pair for every memory type,
# over the RDMA and non-RDMA addresses and, for host memory only, over the
# loopback address.
#   $@ - extra arguments placed in front of the fixed io_demo options
# Nothing is run when neither address could be found.
#
run_io_demo() {
    server_rdma_addr=$(get_rdma_device_ip_addr)
    server_nonrdma_addr=$(get_non_rdma_ip_addr)

    mem_types_list="host "
    if [[ "$have_cuda" == "yes" ]]
    then
        mem_types_list="${mem_types_list}cuda cuda-managed "
    fi

    if [[ -z "$server_rdma_addr" && -z "$server_nonrdma_addr" ]]
    then
        return
    fi

    for mem_type in $mem_types_list
    do
        echo "==== Running UCP IO demo with \"${mem_type}\" memory type ===="

        test_args="$@ -o write,read -d 128:4194304 -P 2 -i 10000 -w 10 -c 5 -m ${mem_type} -q"
        test_name=io_demo

        for server_ip in $server_rdma_addr $server_nonrdma_addr
        do
            run_client_server_app "./test/apps/iodemo/${test_name}" "${test_args}" "${server_ip}" 1 0
        done

        # the loopback run is done for host memory only
        if [[ "${mem_type}" == "host" ]]
        then
            run_client_server_app "./test/apps/iodemo/${test_name}" "${test_args}" "127.0.0.1" 1 0
        fi
    done
}
#
# Run UCX performance test
# Note: If requested running with MPI, MPI has to be initialized before
# (the MPI branch uses $MPIRUN, which is not set in this function)
#
# Arguments:
#   $1 - 1 to launch the UCP tests through $MPIRUN; 0 or omitted to launch
#        them as plain client/server and loopback processes
#
run_ucx_perftest() {
# with_mpi defaults to 0 when no argument is given
if [ $# -eq 0 ]
then
with_mpi=0
else
with_mpi=$1
fi
ucx_inst_ptest=$ucx_inst/share/ucx/perftest
# hack for perftest, no way to override params used in batch
# todo: fix in perftest
# Shortened, randomly ordered copies of the installed batch files: every
# "-n <count>" in msg_pow2 becomes "-n 100", and cuda lines are dropped from
# the UCP test lists
sed -s 's,-n [0-9]*,-n 100,g' $ucx_inst_ptest/msg_pow2 | sort -R > $ucx_inst_ptest/msg_pow2_short
cat $ucx_inst_ptest/test_types_uct | sort -R > $ucx_inst_ptest/test_types_short_uct
cat $ucx_inst_ptest/test_types_ucp | grep -v cuda | sort -R > $ucx_inst_ptest/test_types_short_ucp
cat $ucx_inst_ptest/test_types_ucp_rma | grep -v cuda | sort -R >> $ucx_inst_ptest/test_types_short_ucp
ucx_perftest="$ucx_inst/bin/ucx_perftest"
# The argument strings only name the batch files, so rewriting those files
# later changes what the same arguments run
uct_test_args="-b $ucx_inst_ptest/test_types_short_uct \
-b $ucx_inst_ptest/msg_pow2_short -w 1"
ucp_test_args="-b $ucx_inst_ptest/test_types_short_ucp \
-b $ucx_inst_ptest/msg_pow2_short -w 1"
# IP ifaces
ip_ifaces=$(get_active_ip_ifaces)
# shared memory, IB devices, IP ifaces
devices="memory $(get_active_ib_devices) ${ip_ifaces}"
# Run on all devices
# (only on the share of the device list that get_my_tasks hands to this worker)
my_devices=$(get_my_tasks $devices)
for ucx_dev in $my_devices
do
# Pick the transport option, UCX_TLS value and device name by device kind:
# mlx5 device, the "memory" pseudo device, an IP interface, or anything else
if [[ $ucx_dev =~ .*mlx5.* ]]; then
opt_transports="-b $ucx_inst_ptest/transports"
# third column of the transports file, joined with commas, with every
# "mlx5" replaced by "x"
tls=`awk '{print $3 }' $ucx_inst_ptest/transports | tr '\n' ',' | sed -r 's/,$//; s/mlx5/x/g'`
dev=$ucx_dev
elif [[ $ucx_dev =~ memory ]]; then
opt_transports="-x posix"
tls="shm"
dev="all"
elif [[ " ${ip_ifaces[*]} " == *" ${ucx_dev} "* ]]; then
opt_transports="-x tcp"
tls="tcp"
dev=$ucx_dev
else
opt_transports="-x rc_verbs"
tls="rc_v"
dev=$ucx_dev
fi
echo "==== Running ucx_perf kit on $ucx_dev ===="
if [ $with_mpi -eq 1 ]
then
# Run UCP performance test
which mpirun
$MPIRUN -np 2 -x UCX_NET_DEVICES=$dev -x UCX_TLS=$tls $ucx_perftest $ucp_test_args
# Run UCP loopback performance test
which mpirun
$MPIRUN -np 1 -x UCX_NET_DEVICES=$dev -x UCX_TLS=$tls $ucx_perftest $ucp_test_args "-l"
else
# Without MPI the device and transport selection is passed through the
# environment and removed again at the end of the iteration
export UCX_NET_DEVICES=$dev
export UCX_TLS=$tls
# Run UCT performance test
run_client_server_app "$ucx_perftest" "$uct_test_args -d ${ucx_dev} ${opt_transports}" \
"$(hostname)" 0 0
# Run UCT loopback performance test
run_loopback_app "$ucx_perftest" "$uct_test_args -d ${ucx_dev} ${opt_transports}"
# Run UCP performance test
run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0
# Run UCP performance test with 2 threads
run_client_server_app "$ucx_perftest" "$ucp_test_args -T 2" "$(hostname)" 0 0
# Run UCP loopback performance test
run_loopback_app "$ucx_perftest" "$ucp_test_args"
unset UCX_NET_DEVICES
unset UCX_TLS
fi
done
# run cuda tests if cuda module was loaded and GPU is found, and only in
# client/server mode, to reduce testing time
if [ "X$have_cuda" == "Xyes" ] && [ $with_mpi -ne 1 ]
then
# GPUDirect RDMA is always tested disabled ("n"), and enabled ("y") only
# when the nv_peer_mem kernel module is listed by lsmod
gdr_options="n "
if (lsmod | grep -q "nv_peer_mem")
then
echo "GPUDirectRDMA module (nv_peer_mem) is present.."
gdr_options+="y "
fi
# With more than one GPU, expose two of them, chosen by worker index
# (num_gpus is not set in this function)
if [ $num_gpus -gt 1 ]; then
export CUDA_VISIBLE_DEVICES=$(($worker%$num_gpus)),$(($(($worker+1))%$num_gpus))
fi
# Rewrite the short batch files in place: from here on $ucp_test_args runs
# the cuda test types, with 10 iterations per message size
cat $ucx_inst_ptest/test_types_ucp | grep cuda | sort -R > $ucx_inst_ptest/test_types_short_ucp
sed -s 's,-n [0-9]*,-n 10 -w 1,g' $ucx_inst_ptest/msg_pow2 | sort -R > $ucx_inst_ptest/msg_pow2_short
echo "==== Running ucx_perf with cuda memory ===="
for memtype_cache in y n
do
for gdr in $gdr_options
do
export UCX_MEMTYPE_CACHE=$memtype_cache
export UCX_IB_GPU_DIRECT_RDMA=$gdr
run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0
unset UCX_MEMTYPE_CACHE
unset UCX_IB_GPU_DIRECT_RDMA
done
done
export UCX_TLS=self,shm,cma,cuda_copy
run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0
unset UCX_TLS
# Specifically test cuda_ipc for large message sizes
# NOTE(review): this list keeps the NON-cuda test types ("grep -v cuda")
# although the file is named test_types_cuda_ucp - confirm this is intended.
# msg_pow2_large is not generated here; presumably it is an installed file.
cat $ucx_inst_ptest/test_types_ucp | grep -v cuda | sort -R > $ucx_inst_ptest/test_types_cuda_ucp
ucp_test_args_large="-b $ucx_inst_ptest/test_types_cuda_ucp \
-b $ucx_inst_ptest/msg_pow2_large -w 1"
for ipc_cache in y n
do
export UCX_TLS=self,sm,cuda_copy,cuda_ipc
export UCX_CUDA_IPC_CACHE=$ipc_cache
run_client_server_app "$ucx_perftest" "$ucp_test_args_large" "$(hostname)" 0 0
unset UCX_CUDA_IPC_CACHE
unset UCX_TLS
done
echo "==== Running ucx_perf one-sided with cuda memory ===="
# Add RMA tests to the list of tests
# (appended to the cuda list written above, so $ucp_test_args now runs both)
cat $ucx_inst_ptest/test_types_ucp_rma | grep cuda | sort -R >> $ucx_inst_ptest/test_types_short_ucp
run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0
# Run AMO tests
# The message-size batch file for atomics is generated on the fly with two
# entries: "4 -s 4" and "8 -s 8"
echo -e "4 -s 4\n8 -s 8" > "$ucx_inst_ptest/msg_atomic"
ucp_test_args_atomic="-b $ucx_inst_ptest/test_types_ucp_amo \
-b $ucx_inst_ptest/msg_atomic \
-n 1000 -w 1"
run_client_server_app "$ucx_perftest" "$ucp_test_args_atomic" "$(hostname)" 0 0
unset CUDA_VISIBLE_DEVICES
fi
}
#
# Start a perftest daemon in the background.
#
# Arguments:
#   $1 - daemon executable
#   $2 - name of the variable that receives the daemon pid
#   $3 - name of the variable that receives the daemon port
# Globals:
#   server_port - read as the daemon port; step_server_port is called right
#                 after (defined outside this block)
#
start_perftest_daemon() {
    daemon_exe="$1"

    # Find daemon port
    dmn_port="$server_port"
    step_server_port

    # We explicitly disable cuda transport, because it's not p2p and therefore
    # imposes INVALIDATE_RMA flag for all lanes (@see ucp_ep_config_init).
    # However invalidating of imported rkeys is not supported by the daemon.
    # TODO: Should we support invalidation of imported keys?
    # Normally cuda_ipc cannot be used to communicate between host and DPU,
    # unless we run both processes on host for testing purposes.
    # Mandatory options to run the daemon
    # - UCX_RNDV_THRESH=0 is needed to enforce RNDV protocol usage, as it's the
    #   only supported protocol between host and DPU
    # - UCX_RNDV_SCHEME=put_zcopy. On low buffer dimensions (below ~8KB) UCX
    #   prefers bcopy over zero-copy, but bcopy workflow is not supported by DPU
    #   daemon. The workaround is to force rendezvous scheme to use zero-copy.
    #   get_zcopy option is not good enough, because bcopy is still selected for
    #   tiny messages (below 64 bytes)
    dmn_env="UCX_TLS=^cuda UCX_TCP_CM_REUSEADDR=y UCX_RNDV_THRESH=0 UCX_RNDV_SCHEME=put_zcopy"

    # Run the daemon
    env $dmn_env $daemon_exe -p $dmn_port &

    # Return the daemon pid and port through the variables named by the
    # caller. printf -v assigns by name without evaluating the arguments as
    # shell code, unlike the eval it replaces.
    printf -v "$2" '%s' "$!"
    printf -v "$3" '%s' "$dmn_port"
}
#
# Run UCX performance daemon test
#
# For every device returned by get_ib_bf_devices that has an IPv4 address,
# start a server-side and a client-side daemon and run ucx_perftest through
# them; devices without an address are reported and skipped.
#
run_ucx_perftest_with_daemon() {
    ucx_inst_ptest=$ucx_inst/share/ucx/perftest
    ucx_perftest="$ucx_inst/bin/ucx_perftest"
    ucx_perftest_daemon="$ucx_inst/bin/ucx_perftest_daemon"
    ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_daemon"
    devices="$(get_ib_bf_devices $(get_active_ib_devices))"

    for ucx_dev in $devices
    do
        echo "==== Running ucx_perftest over a daemon on $ucx_dev ===="

        ip_addr=$(get_rdma_device_ip_addr $ucx_dev)
        if [[ -n "$ip_addr" ]]
        then
            export UCX_NET_DEVICES=$ucx_dev

            # Start client and server daemons; each call stores the pid and
            # the port in the two named variables
            start_perftest_daemon $ucx_perftest_daemon server_dmn_pid server_dmn_port
            start_perftest_daemon $ucx_perftest_daemon client_dmn_pid client_dmn_port

            ucp_client_args="-g $ip_addr:$client_dmn_port -G $ip_addr:$server_dmn_port $(hostname)"
            run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0

            # Stop both daemons and reap them; failures are ignored
            kill ${client_dmn_pid} || true
            kill ${server_dmn_pid} || true
            wait $client_dmn_pid || true
            wait $server_dmn_pid || true

            unset UCX_TLS
            unset UCX_NET_DEVICES
        else
            echo "Cannot find IPv4 address for device $ucx_dev"
        fi
    done
}
#
# Test malloc hooks with mpi
#
# For both hook modes, runs every memhooks test through $MPIRUN, then repeats
# the malloc_hooks test with libucm.so from the build tree preloaded.
#
test_malloc_hooks_mpi() {
    local memhooks_exe=./test/mpi/test_memhooks

    for mode in reloc bistro
    do
        for tname in malloc_hooks malloc_hooks_unmapped external_events flag_no_install
        do
            echo "==== Running memory hook (${tname} mode ${mode}) on MPI ===="
            which mpirun
            $MPIRUN -np 1 ${memhooks_exe} -t $tname -m ${mode}
        done

        echo "==== Running memory hook (malloc_hooks mode ${mode}) on MPI with LD_PRELOAD ===="
        ucm_lib=$PWD/src/ucm/.libs/libucm.so
        # listing the library also shows in the log whether it exists
        ls -l $ucm_lib
        which mpirun
        $MPIRUN -np 1 -x LD_PRELOAD=$ucm_lib ${memhooks_exe} -t malloc_hooks -m ${mode}
    done
}
#
# Run tests with MPI library
#
# Loads the hpcx-ga-gcc module, builds UCX with MPI support and runs the
# perftest and memory hook tests through mpirun. The tests are skipped when
# the module cannot be loaded or mpirun does not run.
#
run_mpi_tests() {
    prev_LD_LIBRARY_PATH=$LD_LIBRARY_PATH
    mpi_module=hpcx-ga-gcc

    if ! module_load ${mpi_module}
    then
        echo "==== Not running MPI tests ===="
        return
    fi

    if ! mpirun --version
    then
        echo "==== Not running MPI tests ===="
        module unload ${mpi_module}
        return
    fi

    # Prevent our tests from using UCX libraries from hpcx module by prepending
    # our local library path first
    save_LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
    export LD_LIBRARY_PATH=${ucx_inst}/lib:${MPI_HOME}/lib:${prev_LD_LIBRARY_PATH}

    build release --disable-gtest --with-mpi

    # check whether installation is valid (it compiles examples at least)
    $MAKEP installcheck

    # mpirun command line used by the MPI tests below, built one option at a
    # time
    MPIRUN="mpirun"
    MPIRUN+=" --allow-run-as-root"
    MPIRUN+=" --bind-to none"
    MPIRUN+=" -x UCX_ERROR_SIGNALS"
    MPIRUN+=" -x UCX_HANDLE_ERRORS"
    MPIRUN+=" -mca pml ob1"
    MPIRUN+=" -mca osc ^ucx"
    MPIRUN+=" -mca btl tcp,self"
    MPIRUN+=" -mca btl_tcp_if_include lo"
    MPIRUN+=" -mca orte_allowed_exit_without_sync 1"
    MPIRUN+=" -mca coll ^hcoll,ml"

    run_ucx_perftest 1
    test_malloc_hooks_mpi

    # Restore LD_LIBRARY_PATH so subsequent tests will not take UCX libs
    # from installation directory
    export LD_LIBRARY_PATH=${save_LD_LIBRARY_PATH}

    make_clean distclean
    module unload ${mpi_module}
}
#
# Compile the profiling example into ./ucx_profiling, against the headers and
# libraries of the UCX installation in ${ucx_inst}.
#
build_ucx_profiling_test() {
    local inc_dir=${ucx_inst}/include
    local lib_dir=${ucx_inst}/lib

    # the library directory is also recorded as rpath in the binary
    gcc -o ucx_profiling ../test/apps/profiling/ucx_profiling.c -lm -lucs -I${inc_dir} -L${lib_dir} -Wl,-rpath=${lib_dir}
}
#
# Test profiling infrastructure
#
# Builds and runs the profiling example with logging to ucx_jenkins.prof,
# then checks the output of ucx_read_profile for the expected names.
#
test_profiling() {
    local symbol

    echo "==== Running profiling example ===="
    build_ucx_profiling_test

    UCX_PROFILE_MODE=log UCX_PROFILE_FILE=ucx_jenkins.prof ./ucx_profiling

    UCX_READ_PROFILE=${ucx_inst}/bin/ucx_read_profile
    # show the surroundings of "printf" in the log
    $UCX_READ_PROFILE -r ucx_jenkins.prof | grep "printf" -C 20
    # both names must be present in the profile
    for symbol in "calc_pi" "print_pi"
    do
        $UCX_READ_PROFILE -r ucx_jenkins.prof | grep -q "${symbol}"
    done
}
#
# Check with strace that running the profiling example does not call
# socket(). Skipped, with a warning, when have_ptrace or have_strace is empty.
# Returns non-zero when a line starting with "socket" is found in the trace.
#
test_ucs_load() {
    if [[ -z "${have_ptrace}" || -z "${have_strace}" ]]
    then
        log_warning "==== Not running UCS library loading test ===="
        return
    fi

    build_ucx_profiling_test

    # Make sure UCS library constructor does not call socket()
    echo "==== Running UCS library loading test ===="
    strace ./ucx_profiling &> strace.log
    if grep '^socket' strace.log
    then
        return 1
    fi
}
#
# Build with $MAKEP, then run the two dlopen test applications.
#
test_ucs_dlopen() {
    $MAKEP

    # Make sure UCM is not unloaded
    printf '%s\n' "==== Running UCS dlopen test with memhooks ===="
    ./test/apps/test_ucs_dlopen

    # Test global config list integrity after loading/unloading of UCT
    printf '%s\n' "==== Running test_dlopen_cfg_print ===="
    ./test/apps/test_dlopen_cfg_print
}
#
# Check module loading: the UCP dlopen test must mention "cma" (only when
# libuct_cma.so.0 is installed), and the IB component must not show up when
# UCX_MODULES excludes it, neither via the environment nor via
# test_ucp_config. Exits the script with status 1 when it does.
#
test_ucp_dlopen() {
    # Make sure UCP library, when opened with dlopen(), loads CMA module
    LIB_CMA=$(find ${ucx_inst} -name libuct_cma.so.0)
    if [[ -z "$LIB_CMA" ]]
    then
        echo "==== Not running UCP library loading test ===="
    else
        echo "==== Running UCP library loading test ===="
        ./test/apps/test_ucp_dlopen | grep 'cma'
    fi

    # Test module allow-list
    UCX_MODULES=^ib,rdmacm ./src/tools/info/ucx_info -d 2>&1 | tee ucx_info_noib.log
    if grep -in "component:\s*ib$" ucx_info_noib.log
    then
        echo "IB module was loaded even though it was disabled"
        exit 1
    fi

    # Test module allow-list passed through ucp_config_modify()
    ./test/apps/test_ucp_config -c "UCX_MODULES=^ib,rdmacm" 2>&1 | tee ucx_config_noib.log
    if grep -in "component:\s*ib$" ucx_config_noib.log
    then
        echo "IB module was loaded even though it was disabled"
        exit 1
    fi
}
#
# Run the bistro hook test 30 times with a random number of threads between 1
# and twice the cpu count, 10 seconds at most per run. The first failing run
# prints its log and exits the script with status 1.
#
test_ucm_hooks() {
    total=30
    echo "==== Running UCM Bistro hook test ===="
    for i in $(seq 1 $total)
    do
        threads=$(((RANDOM % (2 * $(nproc))) + 1))
        echo "iteration $i/$total: $threads threads"
        if ! timeout 10 ./test/apps/test_hooks -n $threads >test_hooks.log 2>&1
        then
            echo "ERROR running bistro hook test:"
            cat test_hooks.log
            exit 1
        fi
    done

    # only the log of the last iteration is left at this point
    echo "SUCCESS running bistro hook test:"
    cat test_hooks.log
}
#
# Run the multi-thread init test application 10 times, 5 minutes at most per
# run, with OMP_NUM_THREADS limited by the cpu count and by the free space in
# /dev/shm.
#
test_init_mt() {
    local cpu_count

    echo "==== Running multi-thread init ===="
    # Each thread requires 5MB. Cap threads number by total available shared memory.
    max_threads=$(df /dev/shm | awk '/shm/ {printf "%d", $4 / 5000}')
    cpu_count=$(nproc)
    num_threads=$(($max_threads < cpu_count ? $max_threads : cpu_count))

    $MAKEP
    i=0
    while [ $i -lt 10 ]
    do
        OMP_NUM_THREADS=$num_threads timeout 5m ./test/apps/test_init_mt
        i=$((i + 1))
    done
}
#
# Run the memtrack sanity gtest, then the memtrack limit application with two
# limits: the output must contain "SUCCESS" with 512MB and "reached" with
# 412MB.
#
test_memtrack() {
    local limit_case

    echo "==== Running memtrack test ===="
    UCX_MEMTRACK_DEST=stdout GTEST_FILTER=test_memtrack.sanity make -C ./test/gtest test

    echo "==== Running memtrack limit test ===="
    # each entry is <limit>:<text expected in the output>
    for limit_case in 512MB:SUCCESS 412MB:reached
    do
        UCX_MEMTRACK_DEST=stdout UCX_HANDLE_ERRORS=none UCX_MEMTRACK_LIMIT=${limit_case%%:*} \
            ./test/apps/test_memtrack_limit 2>&1 | grep -C 100 "${limit_case#*:}"
    done
}
#
# Check that ucx_info reports UCX_IB_PORTS on an "unused" line, then run the
# fuzzy match test script.
#
test_unused_env_var() {
    # We must create a UCP worker to get the warning about unused variables
    echo "==== Running ucx_info env vars test ===="
    # succeeds only if some output line contains both "unused" and the
    # variable name
    UCX_IB_PORTS=mlx5_0:1 ./src/tools/info/ucx_info -epw -u t | \
        awk '/unused/ && /UCX_IB_PORTS/ { found = 1 } END { exit !found }'

    # Check that suggestions for similar ucx env vars are printed
    echo "==== Running fuzzy match test ===="
    ../test/apps/test_fuzzy_match.py --ucx_info ./src/tools/info/ucx_info
}
#
# Check that the MLX5 transport variables can be set both through their
# transport-specific name (UCX_RC_MLX5_*, UCX_DC_MLX5_*) and through the
# common UCX_RC_* alias. Exits the script with status 1 when a variable is
# missing from the "ucx_info -c" output or a new value is not reported back.
# Runs only when "ucx_info -b" prints a line ending in "HW_TM 1".
#
test_env_var_aliases() {
echo "==== Running MLX5 env var aliases test ===="
if [[ `./src/tools/info/ucx_info -b | grep -P 'HW_TM *1$'` ]]
then
vars=( "TM_ENABLE" "TM_LIST_SIZE" "TX_MAX_BB" )
for var in "${vars[@]}"
do
for tl in "RC_MLX5" "DC_MLX5"
do
# current value: the text after '=' on the matching config line(s)
val=$(./src/tools/info/ucx_info -c | grep "${tl}_${var}" | cut -d'=' -f2)
# NOTE(review): $val is unquoted; with more than one word "[" reports an
# error instead of evaluating the test
if [ -z $val ]
then
echo "UCX_${tl}_${var} does not exist in UCX config"
exit 1
fi
# To check that changing env var takes an effect,
# create some value, which is different from the default.
# The sed script turns "inf"/"auto" into 15, swaps the first "n" and the
# first "y" through a temporary "swap" token, and puts a "1" after the
# first digit
magic_val=`echo $val | sed -e ' s/inf\|auto/15/; s/n/swap/; s/y/n/; s/swap/y/; s/\([0-9]\)/\11/'`
# Check that both (tl name and common RC) aliases work
for var_alias in "RC" $tl
do
var_name=UCX_${var_alias}_${var}
# the export happens inside the command substitution, so the variable
# does not stay set in this shell
val_set=$(export $var_name=$magic_val; ./src/tools/info/ucx_info -c | grep "${tl}_${var}" | cut -d'=' -f2)
if [ "$val_set" != "$magic_val" ]
then
echo "Can't set $var_name"
exit 1
fi
done
done
done
else
echo "HW TM is not compiled in UCX"
fi
}
#
# Run the tcmalloc test application, if it exists and is executable.
#
test_malloc_hook() {
    echo "==== Running malloc hooks test ===="
    if [[ ! -x ./test/apps/test_tcmalloc ]]
    then
        # application is not available: nothing to run
        return 0
    fi
    ./test/apps/test_tcmalloc
}
#
# Run the no-CUDA-context test application, when have_cuda is "yes" and the
# application exists and is executable.
#
test_no_cuda_context() {
    echo "==== Running no CUDA context test ===="
    if [[ "$have_cuda" != "yes" || ! -x ./test/apps/test_no_cuda_ctx ]]
    then
        # no cuda, or application is not available: nothing to run
        return 0
    fi
    ./test/apps/test_no_cuda_ctx
}
#
# Run the gtest watchdog timeout test and check that it printed the expected
# abort message and did not take longer than allowed.
#   $1 - value for WATCHDOG_GTEST_TIMEOUT_
#   $2 - value for WATCHDOG_GTEST_SLEEP_TIME_
#   $3 - maximum accepted run time, in seconds
# Exits the script with status 1 when either check fails.
#
run_gtest_watchdog_test() {
    watchdog_timeout=$1
    sleep_time=$2
    expected_runtime=$3
    expected_err_str="Connection timed out - abort testing"

    echo "==== Running watchdog timeout test ===="

    start_time=$(date +%s)

    # the output is shown and also saved to a file for the check below
    env WATCHDOG_GTEST_TIMEOUT_=$watchdog_timeout WATCHDOG_GTEST_SLEEP_TIME_=$sleep_time GTEST_FILTER=test_watchdog.watchdog_timeout make -C ./test/gtest test 2>&1 | tee watchdog_timeout_test &
    pid=$!
    wait $pid

    end_time=$(date +%s)

    # the message must be present as one whole line of the output
    res="$(grep -x "$expected_err_str" watchdog_timeout_test)" || true
    rm -f watchdog_timeout_test

    if [[ "$res" != "$expected_err_str" ]]
    then
        echo "didn't find [$expected_err_str] string in the test output"
        exit 1
    fi

    runtime=$(($end_time-$start_time))
    if (( runtime > expected_runtime ))
    then
        echo "Watchdog timeout test takes $runtime seconds that" \
             "is greater than expected $expected_runtime seconds"
        exit 1
    fi
}
#
# Run the malloc hook and cuda hook gtests, each as a separate "make test"
# with its own environment, under the $TIMEOUT prefix.
#
run_malloc_hook_gtest() {
    local gtest_env

    # GTEST_SHARD_INDEX/GTEST_TOTAL_SHARDS should NOT be set

    echo "==== Running malloc hooks mallopt() test, $compiler_name compiler ===="
    gtest_env=(UCX_IB_RCACHE=n
               MALLOC_TRIM_THRESHOLD_=-1
               MALLOC_MMAP_THRESHOLD_=-1
               GTEST_FILTER=malloc_hook_cplusplus.mallopt)
    $TIMEOUT env "${gtest_env[@]}" make -C test/gtest test

    echo "==== Running malloc hooks mmap_ptrs test with MMAP_THRESHOLD=16384, $compiler_name compiler ===="
    gtest_env=(MALLOC_MMAP_THRESHOLD_=16384
               GTEST_FILTER=malloc_hook_cplusplus.mmap_ptrs)
    $TIMEOUT env "${gtest_env[@]}" make -C test/gtest test

    echo "==== Running cuda hooks, $compiler_name compiler ===="
    gtest_env=(GTEST_FILTER='cuda_hooks.*')
    $TIMEOUT env "${gtest_env[@]}" make -C test/gtest test

    echo "==== Running cuda hooks with far jump, $compiler_name compiler ===="
    gtest_env=(UCX_MEM_BISTRO_FORCE_FAR_JUMP=y
               GTEST_FILTER='cuda_hooks.*')
    $TIMEOUT env "${gtest_env[@]}" make -C test/gtest test
}
#
# Export the gtest settings shared by all gtest runs.
#
set_gtest_common_test_flags() {
    # shuffle the tests, with GTEST_RANDOM_SEED set to 0
    export GTEST_RANDOM_SEED=0 GTEST_SHUFFLE=1

    # Run UCT tests for TCP over fastest device only
    export GTEST_UCT_TCP_FASTEST_DEV=1

    export OMP_NUM_THREADS=4
}
#
# Export the gtest settings for a sharded "make test" run: the common flags
# plus sharding, the longest-tests report and the performance check options.
#
set_gtest_make_test_flags() {
    set_gtest_common_test_flags

    # Distribute the tests among the workers
    export GTEST_SHARD_INDEX=$worker GTEST_TOTAL_SHARDS=$nworkers

    # Report TOP-20 longest test at the end of testing
    export GTEST_REPORT_LONGEST_TESTS=20

    if [[ "$TEST_PERF" == 1 && "$VALGRIND_CHECK" != "yes" ]]
    then
        # Check performance with 10 retries and 2 seconds interval
        # (the value starts with a space, as it always did)
        export GTEST_EXTRA_ARGS=" -p 10 -i 2.0"
    else
        export GTEST_EXTRA_ARGS=""
    fi
}
#
# Remove every variable set by set_gtest_common_test_flags and
# set_gtest_make_test_flags from the environment.
#
unset_test_flags() {
    # set by set_gtest_make_test_flags
    unset OMP_NUM_THREADS GTEST_EXTRA_ARGS GTEST_REPORT_LONGEST_TESTS \
          GTEST_TOTAL_SHARDS GTEST_SHARD_INDEX
    # set by set_gtest_common_test_flags
    unset GTEST_UCT_TCP_FASTEST_DEV GTEST_SHUFFLE GTEST_RANDOM_SEED
}
run_specific_tests() {
set_gtest_common_test_flags
# Run specific tests