TL;DR bash参数扩展导致命令输出跟预期结果不一致。
问题描述
用户反馈echo
或者printf
命令如下,当有1或者g的时候,只会输出1或/和g,没有其他数据,怀疑系统有问题。
$ printf [3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]
1
$ echo [3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]
1 g
$ echo [adfaasdljfalsjljajzpqwekzv]
[adfaasdljfalsjljajzpqwekzv]
$ uname -r
2.6.32-504.16.2.el6.x86_64
问题分析
使用strace
命令去跟踪,发现bash在执行execve系统调用的时候,传递给echo的就是只有1和g。问题应该在bash。
# strace -f echo [aaag1bbb]
execve("/bin/echo", ["echo", "1", "g"], [/* 53 vars */]) = 0
接下来用gdb来分析bash的行为。由于只知道bash在执行execve的时候,传递给echo命令的参数不对,所以只设置捕获execve系统调用。
# gdb --args /bin/bash -c '/bin/echo [g1]'
(gdb) catch syscall execve
Catchpoint 1 (syscall 'execve' [59])
运行程序,查看断点的backtrace信息
(gdb) r
Starting program: /bin/bash -c /bin/echo\ \[g1\]
Catchpoint 1 (call to syscall 'execve'), 0x0000003047ebdb37 in execve () from /lib64/libc.so.6
Missing separate debuginfos, use: debuginfo-install ncurses-libs-5.7-4.20090207.el6.x86_64
(gdb) bt
#0 0x0000003047ebdb37 in execve () from /lib64/libc.so.6
#1 0x000000000042e156 in shell_execve (command=0x8ee0e0 "/bin/echo", args=0x8eed00, env=0x8ee9e0) at execute_cmd.c:4816
#2 0x000000000042fade in execute_disk_command (simple_command=<value optimized out>, pipe_in=-1, pipe_out=-1, async=<value optimized out>,
fds_to_close=<value optimized out>) at execute_cmd.c:4616
#3 execute_simple_command (simple_command=<value optimized out>, pipe_in=-1, pipe_out=-1, async=<value optimized out>, fds_to_close=<value optimized out>)
at execute_cmd.c:3863
#4 0x0000000000430463 in execute_command_internal (command=0x8ee6f0, asynchronous=<value optimized out>, pipe_in=-1, pipe_out=-1, fds_to_close=0x8ee740)
at execute_cmd.c:741
#5 0x000000000046b86b in parse_and_execute (string=<value optimized out>, from_file=0x4a0fb8 "-c", flags=<value optimized out>) at evalstring.c:339
#6 0x000000000041bada in run_one_command (command=0x7fffffffdfef "/bin/echo [g1]") at shell.c:1325
#7 0x000000000041cbab in main (argc=<value optimized out>, argv=0x7fffffffdcb8, env=0x7fffffffdcd8) at shell.c:698
先看frame 1的信息,发现shell_execve()
函数的参数args是有问题的。不应该是1
和g
,而应该是[g1]
。
(gdb) frame 1
#1 0x000000000042e156 in shell_execve (command=0x8ee0e0 "/bin/echo", args=0x8eed00, env=0x8ee9e0) at execute_cmd.c:4816
4816 execve (command, args, env);
(gdb) list execute_cmd.c:4816
4811 int larray, i, fd;
4812 char sample[80];
4813 int sample_len;
4814
4815 SETOSTYPE (0); /* Some systems use for USG/POSIX semantics */
4816 execve (command, args, env);
4817 i = errno; /* error from execve() */
4818 CHECK_TERMSIG;
4819 SETOSTYPE (1);
4820
(gdb) set listsize 20
(gdb) list execute_cmd.c:4816
4806 int
4807 shell_execve (command, args, env)
4808 char *command;
4809 char **args, **env;
4810 {
4811 int larray, i, fd;
4812 char sample[80];
4813 int sample_len;
4814
4815 SETOSTYPE (0); /* Some systems use for USG/POSIX semantics */
4816 execve (command, args, env);
4817 i = errno; /* error from execve() */
4818 CHECK_TERMSIG;
4819 SETOSTYPE (1);
4820
4821 /* If we get to this point, then start checking out the file.
4822 Maybe it is something we can hack ourselves. */
4823 if (i != ENOEXEC)
4824 {
4825 if (file_isdir (command))
(gdb) p args
$2 = (char **) 0x8eed00
(gdb) p *args
$3 = 0x8ee880 "/bin/echo"
(gdb) p *args@4
$8 = {0x8ee880 "/bin/echo", 0x8ee940 "1", 0x8ee100 "g", 0x0}
再往上看frame 2,传递的args参数被optimized out了,看不了。但可以看到args是由words来构成的,而words在这里是有问题的。
(gdb) frame 2
#2 0x000000000042fade in execute_disk_command (simple_command=<value optimized out>, pipe_in=-1, pipe_out=-1, async=<value optimized out>,
fds_to_close=<value optimized out>) at execute_cmd.c:4616
4616 exit (shell_execve (command, args, export_env));
(gdb) p args
$11 = <value optimized out>
(gdb) list execute_cmd.c:4616
4606 }
4607
4608 wl = make_word_list (make_word (NOTFOUND_HOOK), words);
4609 exit (execute_shell_function (hookf, wl));
4610 }
4611
4612 /* Execve expects the command name to be in args[0]. So we
4613 leave it there, in the same format that the user used to
4614 type it in. */
4615 args = strvec_from_word_list (words, 0, 0, (int *)NULL);
4616 exit (shell_execve (command, args, export_env));
4617 }
4618 else
4619 {
4620 parent_return:
4621 /* Make sure that the pipes are closed in the parent. */
4622 close_pipes (pipe_in, pipe_out);
4623 #if defined (PROCESS_SUBSTITUTION) && defined (HAVE_DEV_FD)
4624 if (variable_context == 0)
4625 unlink_fifo_list ();
(gdb) p words
$25 = (WORD_LIST *) 0x8ee960
(gdb) p *words
$26 = {next = 0x8ee8a0, word = 0x8ebf30}
(gdb) p *words->word
$27 = {word = 0x8ee880 "/bin/echo", flags = 0}
(gdb) p *words->next->word
$28 = {word = 0x8ee940 "1", flags = 8}
(gdb) p *words->next->next->word
$29 = {word = 0x8ee100 "g", flags = 8}
接着看frame 3,在执行execute_disk_command()
函数的时候,words参数也是有问题的。
(gdb) frame 3
#3 execute_simple_command (simple_command=<value optimized out>, pipe_in=-1, pipe_out=-1, async=<value optimized out>, fds_to_close=<value optimized out>)
at execute_cmd.c:3863
3863 execute_disk_command (words, simple_command->redirects, command_line,
(gdb) list execute_cmd.c:3863
3853 }
3854
3855 if (command_line == 0)
3856 command_line = savestring (the_printed_command_except_trap);
3857
3858 #if defined (PROCESS_SUBSTITUTION)
3859 if ((subshell_environment & SUBSHELL_COMSUB) && (simple_command->flags & CMD_NO_FORK) && fifos_pending() > 0)
3860 simple_command->flags &= ~CMD_NO_FORK;
3861 #endif
3862
3863 execute_disk_command (words, simple_command->redirects, command_line,
3864 pipe_in, pipe_out, async, fds_to_close,
3865 simple_command->flags);
3866
3867 return_result:
3868 bind_lastarg (lastarg);
3869 FREE (command_line);
3870 dispose_words (words);
3871 discard_unwind_frame ("simple-command");
3872 this_command_name = (char *)NULL; /* points to freed memory now */
(gdb) p words
$31 = (WORD_LIST *) 0x8ee960
(gdb) p *words
$33 = {next = 0x8ee8a0, word = 0x8ebf30}
(gdb) p *words->word
$34 = {word = 0x8ee880 "/bin/echo", flags = 0}
(gdb) p *words->next->word
$35 = {word = 0x8ee940 "1", flags = 8}
(gdb) p *words->next->next->word
$36 = {word = 0x8ee100 "g", flags = 8}
(gdb) p command_line
$32 = 0x8ee9a0 "/bin/echo [g1]"
再看frame 4,可以看到传给execute_simple_command()
函数的参数是对的。也就是说在执行execute_simple_command()
函数的时候,中间某一步words的值变了,然后对执行execute_disk_command()
函数的时候就开始不对了。
(gdb) frame 4
#4 0x0000000000430463 in execute_command_internal (command=0x8ee6f0, asynchronous=<value optimized out>, pipe_in=-1, pipe_out=-1, fds_to_close=0x8ee740)
at execute_cmd.c:741
741 execute_simple_command (command->value.Simple, pipe_in, pipe_out,
(gdb) list execute_cmd.c:741
731 last_pid = last_made_pid;
732 was_error_trap = signal_is_trapped (ERROR_TRAP) && signal_is_ignored (ERROR_TRAP) == 0;
733
734 if (ignore_return && command->value.Simple)
735 command->value.Simple->flags |= CMD_IGNORE_RETURN;
736 if (command->flags & CMD_STDIN_REDIR)
737 command->value.Simple->flags |= CMD_STDIN_REDIR;
738
739 line_number_for_err_trap = line_number = command->value.Simple->line;
740 exec_result =
741 execute_simple_command (command->value.Simple, pipe_in, pipe_out,
742 asynchronous, fds_to_close);
743 line_number = save_line_number;
744
745 /* The temporary environment should be used for only the simple
746 command immediately following its definition. */
747 dispose_used_env_vars ();
748
749 #if (defined (ultrix) && defined (mips)) || defined (C_ALLOCA)
750 /* Reclaim memory allocated with alloca () on machines which */
(gdb) p command
$37 = (COMMAND *) 0x8ee6f0
(gdb) p *command
$38 = {type = cm_simple, flags = 64, line = 0, redirects = 0x0, value = {For = 0x8ee720, Case = 0x8ee720, While = 0x8ee720, If = 0x8ee720,
Connection = 0x8ee720, Simple = 0x8ee720, Function_def = 0x8ee720, Group = 0x8ee720, Select = 0x8ee720, Arith = 0x8ee720, Cond = 0x8ee720,
ArithFor = 0x8ee720, Subshell = 0x8ee720, Coproc = 0x8ee720}}
(gdb) p *command->Simple
There is no member named Simple.
(gdb) p *command->value.Simple
$39 = {flags = 64, line = 0, words = 0x8ec0e0, redirects = 0x0}
(gdb) p *command->value.Simple->words
$40 = {next = 0x8ec0c0, word = 0x8ee670}
(gdb) p *command->value.Simple->words->word
$41 = {word = 0x8ee690 "/bin/echo", flags = 0}
(gdb) p *command->value.Simple->words->next->word
$42 = {word = 0x8ee6d0 "[g1]", flags = 0}
上面最简单的定位部分就完成了,下面是对execute_simple_command()
函数进行断点,一步步分析。 其中的过程就是在断点开始,对着代码,用n
命令一直下一步,用p
命令看words这个数据有没有变化。重复的步骤就不啰嗦了。
(gdb) break execute_simple_command
Breakpoint 2 at 0x42eb80: file execute_cmd.c, line 3540.
说重点,在bash-4.1/execute_cmd.c:3656这行words的值是有变化的,但值也是被optimized out了,执行到3667行的时候,可以看到words变了。所以要step in看看expand_words
做了什么。
3657 if (simple_command->flags & CMD_IGNORE_RETURN)
(gdb)
3656 words = expand_words (simple_command->words);
(gdb) p *words->next->word
value has been optimized out
(gdb) n
3657 if (simple_command->flags & CMD_IGNORE_RETURN)
(gdb)
3667 if (words == 0)
(gdb) p *words->next->word
$1 = {word = 0x8ee940 "1", flags = 8}
这里list是expand_words()
传进来的值,而new_list是返回的值,到这一步还是正确的。
Breakpoint 2, execute_simple_command (simple_command=0x8ee720, pipe_in=-1, pipe_out=-1, async=<value optimized out>, fds_to_close=<value optimized out>)
at execute_cmd.c:3656
3656 words = expand_words (simple_command->words);
(gdb) s
expand_words (list=0x8ec0e0) at subst.c:8748
8748 return (expand_word_list_internal (list, WEXP_ALL));
(gdb) n
expand_word_list_internal (list=0x8ec0e0, eflags=31) at subst.c:9074
9074 {
(gdb)
9078 if (list == 0)
(gdb)
9074 {
(gdb)
9078 if (list == 0)
(gdb)
9081 garglist = new_list = copy_word_list (list);
(gdb)
9082 if (eflags & WEXP_VARASSIGN)
(gdb)
9084 garglist = new_list = separate_out_assignments (new_list);
(gdb)
9126 new_list = shell_expand_word_list (new_list, eflags);
(gdb) p *list->next->word
value has been optimized out
(gdb) p *new_list->next->word
$4 = {word = 0x8ee880 "[g1]", flags = 0}
这里有个问题是,很多时候由于编译器优化的原因,不能获取值。比如expand_words()
执行完了,值也变了,我们没看到是哪一步变的。这时候只能在这个函数从头再来一次,再step into更底层次的函数里。
9134 new_list = glob_expand_word_list (new_list, eflags);
(gdb) p *new_list->next->word
value has been optimized out
(gdb) n
9140 if ((eflags & WEXP_VARASSIGN) && subst_assign_varlist)
(gdb)
9186 }
(gdb)
execute_simple_command (simple_command=0x8ee720, pipe_in=-1, pipe_out=-1, async=<value optimized out>, fds_to_close=<value optimized out>)
at execute_cmd.c:3657
3657 if (simple_command->flags & CMD_IGNORE_RETURN)
(gdb)
当然,我们从代码上可以看出,修改的地方可能有下面几个,这些都是跟Bash - Shell Expansions相关的。看文档比看代码里的字符操作要更便捷一些。
// bash-4.1/subst.c
new_list = brace_expand_word_list (new_list, eflags);
new_list = shell_expand_word_list (new_list, eflags);
new_list = glob_expand_word_list (new_list, eflags);
new_list = dequote_list (new_list);
其实用户的问题是Filename Expansion - Pattern Matching问题,bash会先把[3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]
去做文件名匹配。由于当前目录下有文件1
和g
,所以,一大串字符会扩展成1
和g
再传给echo命令,结果就是输出1
和g
。如果用引号引起来,就没有这个问题了。或者去别的没有这两个文件的目录,也不会有问题。
$ ls -l 1 g
-rw-r--r-- 1 root root 7284 Apr 20 16:44 1
-rw-r--r-- 1 root root 169 Apr 16 14:42 g
$ echo [3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]
1 g
$ echo '[3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]'
[3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]
$ cd /tmp
$ echo [3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]
[3jnrw:mvlewgtf09u23434e12ermld234r25cscfw]
总结
费了这么大周折来搞明白这个问题,想想真有点傻。Filename Expansion虽然平时也有用,但就是没想起来,导致花费了不少的时间。有时候真应该多想想,三思而后行,而不是上来就蛮干。