为什么不同的 LC_ALL 设定会导致 sort 命令输出顺序不一样?

在使用 sort 命令对文本进行排序时,如果语言环境不同,得到的排序结果也会不同。

[root@rhel674 tmp]# export LC_ALL=C; sort test.txt 
1234
AAA
BBB
aaa
aab

[root@rhel674 tmp]# export LC_ALL=en_US; sort test.txt 
1234
aaa
AAA
aab
BBB

如果用 C (POSIX) 作为语言环境,排序结果是按照字符的 ASCII 码(即字节值)大小来排列的,因此数字排在大写字母之前,大写字母又排在小写字母之前。而如果用 en_US 等语言环境,排序结果会不同(从上面的输出可以看到:先按字母顺序排序,字母都一样的时候才区分大小写,且小写排在大写之前)。

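从 sort 的源码可以看到,sort 实际上是经由 compare() → xmemcoll0() → memcoll0() → strcoll_loop() 这条调用链,最终调用 strcoll() 来对字符串进行比较的(下面代码中标有 //<<------------- 的行即为调用链上的关键调用)。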

/* File: src/sort.c */
/* Compare lines A and B and return a negative, zero, or positive value.
   If a key list is present the keys are compared first; otherwise, or
   when the keys tie and neither `unique' nor `stable' is set, the whole
   lines are compared.  The result is negated when `reverse' is set.  */
static int
compare (struct line const *a, struct line const *b)
{
  /* Key comparison comes first.  Its result is final when it found a
     difference, or when `unique' or `stable' is in effect.  */
  if (keylist)
    {
      int key_diff = keycompare (a, b);
      if (key_diff != 0 || unique || stable)
        return key_diff;
    }

  /* No keys, or all keys compared equal: compare the whole lines.
     One byte is dropped from each stored length here; the collation
     call below is handed len + 1 bytes again, i.e. the block including
     its trailing NUL.  */
  size_t const len_a = a->length - 1;
  size_t const len_b = b->length - 1;
  int result;

  if (len_a == 0)
    result = - NONZERO (len_b);
  else if (len_b == 0)
    result = 1;
  else if (hard_LC_COLLATE && !folding)
    {
      /* xmemcoll0 is preferred for speed: it does not unconditionally
         write '\0' after the buffers it is given, which was measured
         at roughly 3% faster for short lines.  */
      result = xmemcoll0 (a->text, len_a + 1, b->text, len_b + 1); //<<-------------
    }
  else
    {
      /* Plain byte comparison over the common prefix; on a tie the
         shorter line sorts first.  */
      result = memcmp (a->text, b->text, MIN (len_a, len_b));
      if (result == 0)
        result = len_a < len_b ? -1 : len_a != len_b;
    }

  if (reverse)
    return -result;
  return result;
}

/* File: lib/xmemcoll.c */
/* Compare S1 (a memory block of size S1SIZE, with a NUL as last byte)
   and S2 (a memory block of size S2SIZE, with a NUL as last byte)
   according to the LC_COLLATE locale.  S1SIZE and S2SIZE must be > 0.
   Report an error and exit if there is an error.  */
                   
int
xmemcoll0 (char const *s1, size_t s1size, char const *s2, size_t s2size)
{
  int const result = memcoll0 (s1, s1size, s2, s2size); //<<-------------

  /* Capture errno right away so later calls cannot overwrite it.  */
  int const saved_errno = errno;

  /* On error, report it; the sizes passed exclude the trailing NUL.  */
  if (saved_errno != 0)
    collate_error (saved_errno, s1, s1size - 1, s2, s2size - 1);

  return result;
}


/* File: lib/memcoll.c */
/* Compare S1 (a memory block of size S1SIZE, with a NUL as last byte)
   and S2 (a memory block of size S2SIZE, with a NUL as last byte)
   according to the LC_COLLATE locale.  S1SIZE and S2SIZE must be > 0.
   Set errno to an error number if there is an error, and to zero
   otherwise.  */
int
memcoll0 (char const *s1, size_t s1size, char const *s2, size_t s2size)
{
  /* Blocks that differ in size or content need a real locale-aware
     comparison.  */
  if (s1size != s2size || memcmp (s1, s2, s1size) != 0)
    return strcoll_loop (s1, s1size, s2, s2size); //<<-------------

  /* Byte-identical blocks are equal; report success without
     calling strcoll at all.  */
  errno = 0;
  return 0;
}


/* File: lib/memcoll.c */
/* Compare S1 (with size S1SIZE) and S2 (with length S2SIZE) according
   to the LC_COLLATE locale.  S1 and S2 are both blocks of memory with
   nonzero sizes, and the last byte in each block must be a null byte.
   Set errno to an error number if there is an error, and to zero
   otherwise.  */
static int
strcoll_loop (char const *s1, size_t s1size, char const *s2, size_t s2size)
{
  int diff;

  /* errno is cleared before every strcoll call.  The loop body runs
     only while strcoll reports equality (diff == 0) without an error
     (errno == 0); a nonzero diff or a nonzero errno ends the loop.  */
  while (! (errno = 0, (diff = strcoll (s1, s2)) || errno))  //<<-------------
    {
      /* strcoll found no difference, but perhaps it was fooled by NUL
         characters in the data.  Work around this problem by advancing
         past the NUL chars.  */
      size_t size1 = strlen (s1) + 1;
      size_t size2 = strlen (s2) + 1;
      s1 += size1;
      s2 += size2;
      s1size -= size1;
      s2size -= size2;

      /* When one block is used up, the block that ran out first sorts
         first; if both ran out together they are equal.  */
      if (s1size == 0)
        return - (s2size != 0);
      if (s2size == 0)
        return 1;
    }

  return diff;
}

strcoll() 会根据当前 locale 的规则来比较两个字符串,而 locale 里的 LC_COLLATE 类别则定义了该语言环境下各字符的排序权重。

通常,像 en_US 之类的语言环境,会使用 iso14651_t1_common 中的 LC_COLLATE 定义。在 RHEL/CentOS 中,这个文件存放在 /usr/share/i18n/locales/iso14651_t1_common。

从这个文件中,可以找到 a/b/A/B 的排序权重。

<U0061> <a>;<BAS>;<MIN>;IGNORE # 198 a                                                                                                                                 
<U0062> <b>;<BAS>;<MIN>;IGNORE # 233 b
<U0041> <a>;<BAS>;<CAP>;IGNORE # 517 A
<U0042> <b>;<BAS>;<CAP>;IGNORE # 550 B
## 以 AaBa/AAbA 这两个字符串举例。a 和 A 有共同的 collating-symbol <a>,b 和 B 有共同的 collating-symbol <b>,所以在第一轮比较中,AaBa/AAbA 是等价的。在第二轮中,这四个字符都有共同的 collating-symbol <BAS>,所以在这一轮中这两个字符串还是等价的。在第三轮中,a 和 b 是 <MIN> (小写),A 和 B 则是 <CAP> (大写)。在对字符串的第二位进行比较时,a (<MIN>) < A (<CAP>),于是 AaBa < AAbA。

# export LC_ALL=en_US; sort test.txt 
AaBa
AAbA

具体 LC_COLLATE 的定义规则,可参考 http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html