TL;DR sge代码里使用timeval结构体时,没有初始化成员的值,导致timeout时间非常大,jsv进程一直没有timeout退出。
问题描述
用户反馈sge的jsv进程一直卡死,如下面所示,15827、14992、15109、8805等进程应该很快就会结束,而不是一直运行着
# pstree -p | grep -i jsv2
|-sjm(402)-+-jsv2.sh(15827)---jsv2.sh(2855)---cut(2857)
|-sjm(30992)-+-jsv2.sh(14992)---jsv2.sh(2849)---cut(2851)
|-sjm(31662)-+-jsv2.sh(15109)---jsv2.sh(2846)---cut(2848)
|-sjm(12019)-+-jsv2.sh(8805)
问题分析
先用pstack
看进程的情况
# pstack 12019
... skipped ....
Thread 1 (Thread 0x7f802063d840 (LWP 12019)):
#0 0x0000003a9f4bd79d in nanosleep () from /lib64/libc.so.6
#1 0x00007f80223d5e1e in sge_usleep () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#2 0x00007f80223d135e in sge_peclose () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#3 0x00007f8022333d0e in jsv_stop () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#4 0x00007f80223360d9 in jsv_do_communication () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#5 0x00007f8022334dae in jsv_do_verify () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#6 0x00007f80222efe05 in sge_gdi_packet_execute_external () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#7 0x00007f80222ed007 in sge_gdi2_multi () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#8 0x00007f80222ed6fb in sge_gdi2 () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#9 0x00007f80223df189 in japi_send_job () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#10 0x00007f80223e8292 in japi_run_job () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#11 0x00007f80222d4442 in drmaa_run_job () from /opt/gridengine/lib/lx-amd64/libdrmaa.so.1.0
#12 0x000000000042c375 in SgeBatchSystem::submit(Job&) ()
#13 0x0000000000424d12 in JobMonitor::submitJob() ()
#14 0x00000000004279e8 in JobMonitor::processEvents() ()
#15 0x0000000000427dad in JobMonitor::submit(std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool, std::vector<std::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&) ()
#16 0x000000000040a2d0 in main ()
可以看到,Thread 1一直停在nanosleep()进程。
使用gdb attach用户进程(注:由于权限原因,当前系统只能以用户权限去attach)
# sudo -u <user_name> gdb -p 12019
(gdb) bt full
#0 0x0000003a9f4bd79d in nanosleep () from /lib64/libc.so.6
No symbol table info available.
#1 0x00007f80223d5e1e in sge_usleep (sleep_time=<value optimized out>) at ../libs/uti/sge_time.c:475
req = {tv_sec = 0, tv_nsec = 100000000}
rem = {tv_sec = 0, tv_nsec = 95651751}
#2 0x00007f80223d135e in sge_peclose (pid=8805, fp_in=<value optimized out>, fp_out=<value optimized out>, fp_err=<value optimized out>,
timeout=0x7ffda3d2ce80) at ../libs/uti/sge_stdio.c:681
sleep_for = 100000
i = <value optimized out>
timeout_us = 140099372198732
SGE_FUNC = "sge_peclose"
#3 0x00007f8022333d0e in jsv_stop (jsv=0xa862b0, answer_list=<value optimized out>, try_soft_quit=<value optimized out>) at ../libs/sgeobj/sge_jsv.c:393
scriptfile = 0xa7d810 "/opt/gridengine/default/common/jsv2.sh"
t = {tv_sec = 0, tv_usec = 140188306598732}
pid = 8805
SGE_FUNC = "jsv_stop"
#4 0x00007f80223360d9 in jsv_do_communication (ctx=0xa75270, jsv=0xa862b0, answer_list=<value optimized out>) at ../libs/sgeobj/sge_jsv_script.c:2574
start_time = 1588999988
do_retry = true
jsv_timeout = 10
ret = true
input = "STARTED\000TATE ACCEPT Job is accepted", '\000' <repeats 197 times>, "Ч{\237:\000\000\000\321.-\\\002\200\377\377\060\321ң\375\177\000\000/\321ң\375\177", '\000' <repeats 82 times>"\200, \247{\237:\000\000\000\020\004\000\000\000\000\000\000\000\004\000\000\000\000\000\000\020T\250\000\000\000\000\000@3\252\000\000\000\000\000\b\a\001\000\000\000\000\000y\344G\237:\000\000\000\000\004\000\000\000\000\000\000\361--\\\002\200\377\377A\000\000\000\000\000\000\000\020\000\000\000\000\000\000\000@\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\020\322ң\375\177\000\000\000\000\000\000\000\000\000\000]\000\000\000n", '\000' <repeats 19 times>, "w\000\000\000|\000\000\000"...
SGE_FUNC = "jsv_do_communication"
#5 0x00007f8022334dae in jsv_do_verify (ctx=0xa75270, context=<value optimized out>, job=0x7ffda3d2f8e8, answer_list=0x7ffda3d2fab8, holding_lock=false)
at ../libs/sgeobj/sge_jsv.c:1095
old_job = <value optimized out>
new_job = 0xa9b770
jsv_url = 0x0
holding_mutex = <value optimized out>
ret = <value optimized out>
jsv = 0xa862b0
jsv_next = 0x0
iterator = 0x0
SGE_FUNC = "jsv_do_verify"
... skipped ...
从frame 3可以看到jsv执行jsv_stop()
去停止jsv实例,代码里函数的描述是这样的
// source/libs/sgeobj/sge_jsv.c
/****** sge_jsv/jsv_stop() *****************************************************
* NAME
* jsv_stop() -- Stop a JSV instance which was previously started-
*
* SYNOPSIS
* bool jsv_stop(lListElem *jsv, lList **answer_list, bool try_soft_quit)-
*
* FUNCTION
* Stop a running JSV instance which was previously started. If the
* variable 'try_soft_quit' is set to 'true' then this function tries
* to communicate with the process. It will then send a "QUIT" string
* to pipe end which is connected with the stdin of the JSV process.
* If the script does then not terminate within JSV_QUIT_TIMEOUT-
* seconds then it will be terminated hard via kill signal.
下面是实际代码部分
// source/libs/sgeobj/sge_jsv.c
bool
jsv_stop(lListElem *jsv, lList **answer_list, bool try_soft_quit) {
bool ret = true;
pid_t pid = -1;
DENTER(TOP_LAYER, "jsv_stop");
/* stop is only possible if it was started before */
pid = jsv_get_pid(jsv);
if (pid != -1) {
const char *scriptfile = lGetString(jsv, JSV_command);
struct timeval t;
/*
* send a quit command to stop the JSV softly.
* give it then JSV_QUIT_TIMEOUT seconds before
* the kill is triggered (sge_peclose)
*/
if (try_soft_quit) { // 大部分情况下,都是try_soft_quit=true,发送QUIT给jsv实例
jsv_send_command(jsv, answer_list, "QUIT");
t.tv_sec = JSV_QUIT_TIMEOUT;
} else { // 但这里的情况是不发送QUIT,下面会详细说明
t.tv_sec = 0;
}
/*
* kill the JSV process
*/
sge_peclose(pid, lGetRef(jsv, JSV_in), // sge_peclose会发送SIGKILL给jsv实例
lGetRef(jsv, JSV_out), lGetRef(jsv, JSV_err), &t);
INFO((SGE_EVENT, MSG_JSV_STOPPED_S, scriptfile));
jsv_set_pid(jsv, -1);
}
DRETURN(ret);
}
在frame 3中,有几个重要信息。pid 8805就是我们认为应该运行很长时间的jsv脚本/opt/gridengine/default/common/jsv2.sh
,而t
是传给sge_peclose()
函数的timeout
参数。
scriptfile = 0xa7d810 "/opt/gridengine/default/common/jsv2.sh"
t = {tv_sec = 0, tv_usec = 140188306598732}
pid = 8805
再看frame 2里sge_peclose()
函数的代码,前面部分首先是初始化timeout_us的值。这里timeout参数就是上面看到的,timeout->tv_sec=0,timeout->tv_usec=140188306598732,这是一个很大的数。
// source/libs/uti/sge_stdio.c
int sge_peclose(pid_t pid, FILE *fp_in, FILE *fp_out, FILE *fp_err,
struct timeval *timeout)
{
int i, status;
long timeout_us = timeout ? (timeout->tv_sec * 1000000L + timeout->tv_usec)
: 0; // 初始值timeout_us = timeout->tv_usec
DENTER(TOP_LAYER, "sge_peclose");
if (fp_in != NULL) { // 到jsv实例的输入、输出都关闭了
FCLOSE(fp_in);
}
if (fp_out != NULL) {
FCLOSE(fp_out);
}
if (fp_err != NULL) {
FCLOSE(fp_err);
}
由于输入、输出关闭,strace
jsv实例,会看到实例在发送数据的时候报EPIPE错误
# strace -f -p 8805 -e write 2>&1 -s 128 | egrep '^write'
write(1, "ERROR JSV script got unknown command \"\"\n", 40) = -1 EPIPE (Broken pipe)
write(1, "ERROR JSV script got unknown command \"\"\n", 40) = -1 EPIPE (Broken pipe)
write(1, "ERROR JSV script got unknown command \"\"\n", 40) = -1 EPIPE (Broken pipe)
这对应的是${SGE_ROOT}/util/resources/jsv/jsv_include.sh
脚本里1723行代码,有兴趣可以了解一下,这里不展开。(注:当前系统sge版本为Son of Grid Engine最新版本)
接着说sge_peclose()
函数。这里是一个while循环。
// source/libs/uti/sge_stdio.c
do {
errno = 0;
i = waitpid(pid, &status, timeout?WNOHANG:0); // wait4系统调用
if (i==-1) {
DEXIT;
return errno;
}
if (i==0) { /* not yet exited */
if (timeout_us <= 0) { // timeout_us是一个非常大的值
#ifdef WIN32 /* kill not called */
DPRINTF(("killing not yet implemented\n"));
timeout = NULL;
/* kill(pid, SIGKILL); */
#else
DPRINTF(("killing\n"));
timeout = NULL;
kill(pid, SIGKILL);
#endif /* WIN32 */
} else {
/* Spin less for longer timeouts; for timeout > 1s, sleep
for 0.1s, n the loop, otherwise 1ms. */
int sleep_for = (timeout_us > 1000000) ? 100000 : 1000;
DPRINTF(("%f seconds waiting for exit\n",
timeout->tv_sec + timeout->tv_usec / 1e6));
timeout_us -= sleep_for; // timeout_us在这里调低,但要降到<=0需时很长
sge_usleep(sleep_for); // nanosleep系统调用
}
}
} while (i != pid);
如果直接去strace
12019这个用户程序的进程,可以看到在进行wait4()
和nanosleep()
两个系统调用,对应的就是上面的waitpid()
和sge_usleep()
在进行底层的调用。
# strace -p 12019
Process 12019 attached
restart_syscall(<... resuming interrupted call ...>) = 0
wait4(8805, 0x7ffda3d2ce3c, WNOHANG, NULL) = 0
nanosleep({0, 100000000}, 0x7ffda3d2cdd0) = 0
wait4(8805, 0x7ffda3d2ce3c, WNOHANG, NULL) = 0
nanosleep({0, 100000000}, 0x7ffda3d2cdd0) = 0
wait4(8805, 0x7ffda3d2ce3c, WNOHANG, NULL) = 0
nanosleep({0, 100000000}, 0x7ffda3d2cdd0) = 0
wait4(8805, 0x7ffda3d2ce3c, WNOHANG, NULL) = 0
到这里我们就知道进程为什么不退出了,因为timeout_us是一个很大的值,不小于等于0,所以sge_peclose()
函数并没有执行kill(pid, SIGKILL)
,而是不断在执行下面这两步。
timeout_us -= sleep_for;
sge_usleep(sleep_for);
从gdb可以看到timeout_us的值要减到小于等于0是要很长时间的。
(gdb) frame 2
#2 0x00007f80223d135e in sge_peclose (pid=8805, fp_in=<value optimized out>, fp_out=<value optimized out>, fp_err=<value optimized out>,
timeout=0x7ffda3d2ce80) at ../libs/uti/sge_stdio.c:681
681 sge_usleep(sleep_for);
(gdb) info locals
sleep_for = 100000
i = <value optimized out>
timeout_us = 140099372198732
SGE_FUNC = "sge_peclose"
- 为什么会出现这个问题?
主要是因为jsv_stop()
函数里t
这个值没有初始化,代码里只设置了t.tv_sec
,没有设置t.tv_usec
这个值,这会导致不确定的结果,也就是这里看到的超大值。
// linux/latest/source/include/uapi/linux/time.h
struct timeval {
__kernel_old_time_t tv_sec; /* seconds */
__kernel_suseconds_t tv_usec; /* microseconds */
};
// source/libs/sgeobj/sge_jsv.c
struct timeval t;
/*
* send a quit command to stop the JSV softly.
* give it then JSV_QUIT_TIMEOUT seconds before
* the kill is triggered (sge_peclose)
*/
if (try_soft_quit) { // 大部分情况下,都是try_soft_quit=true,发送QUIT给jsv实例
jsv_send_command(jsv, answer_list, "QUIT");
t.tv_sec = JSV_QUIT_TIMEOUT;
} else { // 但这里的情况是不发送QUIT,下面会详细说明
t.tv_sec = 0;
}
- 为什么有些作业有问题,有些作业有问题?
我们再去看成backtrace的输出,frame 4是在../libs/sgeobj/sge_jsv_script.c:2574
这一行,do_retry = true
,jsv_timeout = 10
。
#4 0x00007f80223360d9 in jsv_do_communication (ctx=0xa75270, jsv=0xa862b0, answer_list=<value optimized out>) at ../libs/sgeobj/sge_jsv_script.c:2574
start_time = 1588999988
do_retry = true
jsv_timeout = 10
ret = true
input = "STARTED\000TATE ACCEPT Job is accepted", '\000' <repeats 197 times>, "Ч{\237:\000\000\000\321.-\\\002\200\377\377\060\321ң\375\177\000\000/\321ң\375\177", '\000' <repeats 82 times>"\200, \247{\237:\000\000\000\020\004\000\000\000\000\000\000\000\004\000\000\000\000\000\000\020T\250\000\000\000\000\000@3\252\000\000\000\000\000\b\a\001\000\000\000\000\000y\344G\237:\000\000\000\000\004\000\000\000\000\000\000\361--\\\002\200\377\377A\000\000\000\000\000\000\000\020\000\000\000\000\000\000\000@\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\020\322ң\375\177\000\000\000\000\000\000\000\000\000\000]\000\000\000n", '\000' <repeats 19 times>, "w\000\000\000|\000\000\000"...
SGE_FUNC = "jsv_do_communication"
对着代码看,要执行2574这行,有两个条件:
sge_get_gmt() - start_time > jsv_timeout
,就是jsv运行时长已经超过jsv_timeout时间还没有结束(注:默认10秒)- 超时后第一次
do_retry
。这是第一次也是唯一一次重试的机会。
// source/libs/sgeobj/sge_jsv_script.c
2559 while (!lGetBool(jsv, JSV_done)) { // 不断重试,间隔10000微秒
2560 if (sge_get_gmt() - start_time > jsv_timeout) { // 直至超时执行下面的代码
2561 DPRINTF(("JSV - master waited longer than %d s to get response from JSV\n", jsv_timeout));
2562 /*
2563 * In case of a timeout we try it a second time. In that case we kill
2564 * the old instance and start a new one before we continue
2565 * with the verification. Otherwise we report an error which will
2566 * automatically reject the job which should be verified.
2567 */
2568 if (do_retry) { // do_retry初始值是true,这里会重试一次
2569 DPRINTF(("JSV - will retry verification\n"));-
2570 lSetBool(jsv, JSV_restart, false);
2571 lSetBool(jsv, JSV_accept, false);
2572 lSetBool(jsv, JSV_done, false);
2573 DPRINTF(("JSV process will be stopped now\n"));
2574 ret &= jsv_stop(jsv, answer_list, false); // 卡在这里,最后参数为false,导致不发送QUIT给jsv
2575 if (ret) {
2576 DPRINTF(("JSV process will be started now\n"));
2577 ret &= jsv_start(jsv, answer_list);
2578 }
2579 if (ret) {
2580 DPRINTF(("JSV process gets START command\n"));
2581 while (fscanf(lGetRef(jsv, JSV_err), "%[^\n]\n", input) == 1) {
2582 ERROR((SGE_EVENT, MSG_JSV_LOGMSG_S, input));--
2583 }
2584 ret &= jsv_send_command(jsv, answer_list, "START");
2585 }
2586 start_time = sge_get_gmt();
2587 do_retry = false;
所以,如果jsv执行得快的话,就不会有问题。目前发现有问题的都是通过drmaa投递的,猜测可能的情况时,自动化脚本短时间内会投递大量作业,导致性能问题,jsv执行慢,出现超时现象,然后导致这些作业投递失败。
解决方案
纵观代码,总共有两处地方传递了timeout
参数给sge_peclose()
,而没有初始化tv_usec
的值。代码本来并没有想着要用tv_usec
的值,但commit 227a8445修改了sge_peclose()
函数sleep代码的逻辑,使用了tv_usec
的值。
所以,代码最终修改如下,涉及两个文件:
# diff -p -r sge-master.orig/ sge-master/
diff -p -r sge-master.orig/source/daemons/execd/sge_load_sensor.c sge-master/source/daemons/execd/sge_load_sensor.c
*** sge-master.orig/source/daemons/execd/sge_load_sensor.c 2019-08-13 00:54:13.000000000 +0800
--- sge-master/source/daemons/execd/sge_load_sensor.c 2020-05-11 10:31:58.928879447 +0800
*************** static void sge_ls_stop_ls(lListElem *th
*** 385,390 ****
--- 385,396 ----
ret = LS_BROKEN_PIPE;
}
+ /* fix uninitialized t.tv_usec cause undefined behaviour bug.
+ * commit 227a8445 replace sleep() with higher resolution sge_usleep().
+ * it use not only t.tv_sec, but also t.tv_usec.
+ */
+ t.tv_usec = 0;
+
memset(&t, 0, sizeof(t));
if (ret == LS_OK) {
t.tv_sec = LS_QUIT_TIMEOUT;
diff -p -r sge-master.orig/source/libs/sgeobj/sge_jsv.c sge-master/source/libs/sgeobj/sge_jsv.c
*** sge-master.orig/source/libs/sgeobj/sge_jsv.c 2019-08-13 00:54:13.000000000 +0800
--- sge-master/source/libs/sgeobj/sge_jsv.c 2020-05-11 10:28:46.054555752 +0800
*************** jsv_stop(lListElem *jsv, lList **answer_
*** 375,380 ****
--- 375,386 ----
const char *scriptfile = lGetString(jsv, JSV_command);
struct timeval t;
+ /* fix uninitialized t.tv_usec cause undefined behaviour bug.
+ * commit 227a8445 replace sleep() with higher resolution sge_usleep().
+ * it use not only t.tv_sec, but also t.tv_usec.
+ */
+ t.tv_usec = 0;
+
/*
* send a quit command to stop the JSV softly.
* give it then JSV_QUIT_TIMEOUT seconds before
重新编译sge,并替换libdrmaa.so.1.0文件。
总结
sge代码的bug是主要的原因,但jsv的效率也需要做一下性能测试。理论上目前jsv脚本的操作很简单,不应该耗时长,但还是要有实际数据来佐证。