set2 的歷史 Unix V5 tr 命令填充行為與我們今天認為的“經典” System V（1983-1988）行為不同嗎？

January 4, 2014

該 tr 命令已有近 40 年的歷史。它似乎在 1973 年隨 Unix V4 首次出現在 Unix 中，但該版本的原始碼已不可得。下面可能是該命令第二古老的 Unix 實現，出自大約 6 個月後（1974 年 6 月）的 Unix V5：
/* Option flags; main() increments each one when its letter appears in the option argument. */
int dflag 0;
int sflag 0;
int cflag 0;
/* Last character written by main()'s output loop; compared against when sflag is set. */
int save 0;
/* code[c]: the byte that input byte c is translated to (filled in by main()). */
char code[256];
/* squeez[c]: nonzero if a repeated output of c may be suppressed when sflag is set. */
char squeez[256];
/* Scratch table main() uses to build the complement of string1 when cflag is set. */
char vect[256];
/* Expansion state of one argument string: p is the read position, last is the
   character most recently produced, max is the upper end of a pending [a-b]
   range, rep is the remaining count of a pending [a*n] repetition. */
struct string { int last, max, rep; char *p; } string1, string2;
/* Input buffer handed to getc(); main() sets inbuf[0] to 0 before reading. */
int inbuf[259];

/*
 * Entry point: parse an optional "-cds" option argument, take the next
 * two arguments (if present) as string1 and string2, build the
 * translation table code[] and the squeeze table squeez[], then copy
 * input to output through those tables.
 */
main(argc,argv)
char **argv;
{
   int i, j;
   int c, d;
   char *compl;
   extern fout;

   /* Both strings start out empty, with no range or repetition pending. */
   string1.last = string2.last = 0;
   string1.max = string2.max = 0;
   string1.rep = string2.rep = 0;
   string1.p = string2.p = "";

   if(--argc&gt;0) {
       argv++;
       /* An option argument is '-' followed by at least one character.
          Test index 1 (the character right after '-'): any larger index
          can lie beyond the terminating NUL of a short argument such as
          "-c", and a lone "-" must not be treated as options. */
       if(*argv[0]=='-'&&argv[0][1]!=0) {
           while(*++argv[0])
               switch(*argv[0]) {
               case 'c':
                   cflag++;
                   continue;
               case 'd':
                   dflag++;
                   continue;
               case 's':
                   sflag++;
                   continue;
               }
           argc--;
           argv++;
       }
   }
   if(argc&gt;0) string1.p = argv[0];
   if(argc&gt;1) string2.p = argv[1];
   for(i=0; i&lt;256; i++)
       code[i] = vect[i] = 0;
   if(cflag) {
       /* Mark every character of string1, then compact the unmarked
          codes 1..255 into the front of vect as a 0-terminated list. */
       while(c = next(&string1))
           vect[c&0377] = 1;
       j = 0;
       for(i=1; i&lt;256; i++)
           if(vect[i]==0) vect[j++] = i;
       vect[j] = 0;
       compl = vect;
   }
   for(i=0; i&lt;256; i++)
       squeez[i] = 0;
   /* Pair each source character (from the complement list or from
      string1) with the next character of string2. */
   for(;;){
       if(cflag) c = *compl++;
       else c = next(&string1);
       if(c==0) break;
       d = next(&string2);
       /* string2 exhausted: the source character maps to itself. */
       if(d==0) d = c;
       code[c&0377] = d;
       squeez[d&0377] = 1;
   }
   /* Any characters left over in string2 are still marked squeezable. */
   while(d = next(&string2))
       squeez[d&0377] = 1;
   squeez[0] = 1;
   /* Entries never assigned map to themselves; with dflag every
      assigned entry becomes 0, which the output loop drops. */
   for(i=0;i&lt;256;i++) {
       if(code[i]==0) code[i] = i;
       else if(dflag) code[i] = 0;
   }

   inbuf[0] = 0;
   /* NOTE(review): fout presumably is the descriptor used by
      putchar()/flush() in the V5 library -- confirm. */
   fout = dup(1);
   close(1);
   while((c=getc(inbuf)) &gt;=0 ) {
       /* NUL bytes in the input are skipped. */
       if(c == 0) continue;
       /* A translated value of 0 means "delete this character". */
       if(c = code[c&0377]&0377)
           /* With sflag, suppress c when it repeats the previous output
              character and is marked in squeez[]. */
           if(!sflag || c!=save || !squeez[c&0377])
               putchar(save = c);
   }
   flush();
}

/*
 * Return the next character of the expanded string described by s,
 * or 0 once the string is exhausted. Handles the bracket forms
 * [a-b] (range) and [a*n] (repetition); anything else is read
 * through nextc().
 */
next(s)
struct string *s;
{
   int a, b, c, n;
   int base;

   /* A repetition is pending: hand out the same character again. */
   if(--s-&gt;rep &gt; 0) return(s-&gt;last);
   /* A range is pending: step to its next character. */
   if(s-&gt;last &lt; s-&gt;max) return(++s-&gt;last);
   if(*s-&gt;p=='[') {
       /* Consume the '[' and read the first character of the construct. */
       nextc(s);
       s-&gt;last = a = nextc(s);
       s-&gt;max = 0;
       switch(nextc(s)) {
       case '-':
           /* [a-b]: b must not be below a, and ']' must follow. */
           b = nextc(s);
           if(b&lt;a || *s-&gt;p++!=']')
               goto error;
           s-&gt;max = b;
           return(a);
       case '*':
           /* [a*n]: n is read as octal if it starts with '0', else decimal. */
           base = (*s-&gt;p=='0')?8:10;
           n = 0;
           while((c = *s-&gt;p)&gt;='0' && c&lt;'0'+base) {
               n = base*n + c - '0';
               s-&gt;p++;
           }
           if(*s-&gt;p++!=']') goto error;
           /* A missing or zero count is replaced by 1000. */
           if(n==0) n = 1000;
           s-&gt;rep = n;
           return(a);
       default:
       error:
           /* Malformed bracket construct: report on descriptor 1 and exit. */
           write(1,"Bad string\n",11);
           exit();
       }
   }
   return(nextc(s));
}

/*
 * Read one raw character from s-&gt;p, decoding backslash escapes, and
 * return it masked to 8 bits. Returns 0 at the end of the string.
 */
nextc(s)
struct string *s;
{
   int c, i, n;

   c = *s-&gt;p++;
   if(c=='\\') {
       /* Up to three octal digits after the backslash form one character. */
       i = n = 0;
       while(i&lt;3 && (c = *s-&gt;p)&gt;='0' && c&lt;='7') {
           n = n*8 + c - '0';
           i++;
           s-&gt;p++;
       }
       if(i&gt;0) c = n;
       /* No digits: the character after the backslash is taken as is. */
       else c = *s-&gt;p++;
   }
   /* On a 0 result, step the pointer back and store 0 there, so that
      later calls read 0 again from the same position. */
   if(c==0) *--s-&gt;p = 0;
   return(c&0377);
}
隨著時間的發展，從早期開始，命令處理不同長度的集合的方式發生了變化，這裡我感興趣的是 set2 比 set1 短的情況。
GNU Coreutils 手冊討論了這種情況：
當 tr 執行翻譯時，set1 和 set2 通常具有相同的長度。如果 set1 比 set2 短，則忽略 set2 末尾的多餘字元。
另一方面，使 set1 比 set2 長是不可移植的；POSIX 說結果是未定義的。在這種情況下，BSD tr 將 set2 填充到 set1 的長度，方法是根據需要多次重複 set2 的最後一個字元。System V tr 將 set1 截斷為 set2 的長度。
預設情況下，GNU tr 像 BSD tr 一樣處理這種情況。當給出 --truncate-set1 (-t) 選項時，GNU tr 會像 System V tr 那樣處理這種情況。對於翻譯以外的操作，此選項將被忽略。
在這種情況下，像 System V tr 那樣行事打破了相對常見的 BSD 習語：
 tr -cs A-Za-z0-9 '\012'
因為它只將零字節（set1 的補碼中的第一個元素）而不是所有非字母數字轉換為換行符。
The Open Group Base Specifications Issue 7 IEEE Std 1003.1 , 2013 Edition中也有這樣的討論：
當 string2 比 string1 短時，歷史 System V 和 BSD 系統之間會產生差異。BSD 系統用 string2 中的最後一個字元填充 string2。因此，可以執行以下操作：
tr 0123456789 d
這會將所有數字轉換為字母“d”。由於該領域在本卷 POSIX.1-2008 中未明確指定，因此 BSD 和 System V 行為都是允許的，但符合標準的應用程序不能依賴 BSD 行為。它必須按以下方式對範例進行編碼：
tr 0123456789 '[d*]'
現在，如果您閱讀V4和V5中 tr 命令的手冊頁，您會在兩者中看到以下參考：
If string2 is short, it is padded with corresponding characters from string1.
但是在 V6 手冊和後來的早期 Unix 版本中，這句話被省略了，而該命令的 V6 實現卻與 V5 逐行相同。也就是說，手冊上有區別，程式碼上卻沒有？此外，這種實現似乎與所謂的“經典 BSD 或 System V”行為不同，後者是用 set2 的元素進行填充，或者截斷到 set1 的長度。
那麼 V4-V5 實現與 System V 里程碑 Unix 有什麼不同，這種不同實現的基本原理是什麼，最終為什麼它被丟棄了？我怎樣才能找到有關這種早期命令設計的更多資訊？

區別僅在於 V4-V5 手冊中對填充行為的措辭，而行為始終相同。就目前而言，V5 實現的結果與 System V 的結果相同，後者本身又與帶有 --truncate-set1 選項的 GNU tr 行為相同。此外，“將 set1 截斷到 set2 的長度”與“用 string1 中的相應字元填充 string2”給出相同的結果，在實踐中是同一回事。讓我們證明這一點。

首先，您不必是開發人員就可以嘗試編譯它。將原始碼與幾乎相同的 PWB/Unix 版本進行比較，您將看到唯一的區別基本上在於後者依賴“現代”的 stdio.h 設施。因此我從原始碼中去掉了對 fout、dup 和 flush 的引用，並按照 PWB/Unix 的做法替換了 inbuf；由於算法保持不變，這絕不應該改變程序的行為。我已經註釋了我對原始文件所做的微不足道的更改：

#include &lt;stdio.h&gt;    &lt;------ added
/* Option flags; main() increments each one when its letter appears in the option argument. */
int dflag = 0;        &lt;------ added "=" sign to those
int sflag = 0;
int cflag = 0;
/* Last character written by main()'s output loop; compared against when sflag is set. */
int save = 0;
/* code[c]: the byte that input byte c is translated to (filled in by main()). */
char code[256];
/* squeez[c]: nonzero if a repeated output of c may be suppressed when sflag is set. */
char squeez[256];
/* Scratch table main() uses to build the complement of string1 when cflag is set. */
char vect[256];
/* Expansion state of one argument string: p is the read position, last is the
   character most recently produced, max is the upper end of a pending [a-b]
   range, rep is the remaining count of a pending [a*n] repetition. */
struct string { int last, max, rep; char *p; } string1, string2;
/* Input stream read by main(); main() assigns stdin to it. */
FILE *input;          &lt;------ part of the stdio framework I guess;

/*
 * Entry point: parse an optional "-cds" option argument, take the next
 * two arguments (if present) as string1 and string2, build the
 * translation table code[] and the squeeze table squeez[], then copy
 * stdin to standard output through those tables.
 */
main(argc,argv)
char **argv;
{
   int i, j;
   int c, d;
   char *compl;

   /* Both strings start out empty, with no range or repetition pending. */
   string1.last = string2.last = 0;
   string1.max = string2.max = 0;
   string1.rep = string2.rep = 0;
   string1.p = string2.p = "";

   if(--argc&gt;0) {
       argv++;
       /* An option argument is '-' followed by at least one character. */
       if(*argv[0]=='-'&&argv[0][1]!=0) {
           while(*++argv[0])
               switch(*argv[0]) {
               case 'c':
                   cflag++;
                   continue;
               case 'd':
                   dflag++;
                   continue;
               case 's':
                   sflag++;
                   continue;
               }
           argc--;
           argv++;
       }
   }
   if(argc&gt;0) string1.p = argv[0];
   if(argc&gt;1) string2.p = argv[1];
   for(i=0; i&lt;256; i++)
       code[i] = vect[i] = 0;
   if(cflag) {
       /* Mark every character of string1, then compact the unmarked
          codes 1..255 into the front of vect as a 0-terminated list. */
       while(c = next(&string1))
           vect[c&0377] = 1;
       j = 0;
       for(i=1; i&lt;256; i++)
           if(vect[i]==0) vect[j++] = i;
       vect[j] = 0;
       compl = vect;
   }
   for(i=0; i&lt;256; i++)
       squeez[i] = 0;
   /* Pair each source character (from the complement list or from
      string1) with the next character of string2. */
   for(;;){
       if(cflag) c = *compl++;
       else c = next(&string1);
       if(c==0) break;
       d = next(&string2);
       /* string2 exhausted: the source character maps to itself. */
       if(d==0) d = c;
       code[c&0377] = d;
       squeez[d&0377] = 1;
   }
   /* Any characters left over in string2 are still marked squeezable. */
   while(d = next(&string2))
       squeez[d&0377] = 1;
   squeez[0] = 1;
   /* Entries never assigned map to themselves; with dflag every
      assigned entry becomes 0, which the output loop drops. */
   for(i=0;i&lt;256;i++) {
       if(code[i]==0) code[i] = i;
       else if(dflag) code[i] = 0;
   }

   input = stdin;                     &lt;------ again stdio
   while((c=getc(input)) != EOF ) {   &lt;------
       /* NUL bytes in the input are skipped. */
       if(c == 0) continue;
       /* A translated value of 0 means "delete this character". */
       if(c = code[c&0377]&0377)
           /* With sflag, suppress c when it repeats the previous output
              character and is marked in squeez[]. */
           if(!sflag || c!=save || !squeez[c&0377])
               putchar(save = c);
   }

}

/*
 * Return the next character of the expanded string described by s,
 * or 0 once the string is exhausted. Handles the bracket forms
 * [a-b] (range) and [a*n] (repetition); anything else is read
 * through nextc().
 */
next(s)
struct string *s;
{
   int a, b, c, n;
   int base;

   /* A repetition is pending: hand out the same character again. */
   if(--s-&gt;rep &gt; 0) return(s-&gt;last);
   /* A range is pending: step to its next character. */
   if(s-&gt;last &lt; s-&gt;max) return(++s-&gt;last);
   if(*s-&gt;p=='[') {
       /* Consume the '[' and read the first character of the construct. */
       nextc(s);
       s-&gt;last = a = nextc(s);
       s-&gt;max = 0;
       switch(nextc(s)) {
       case '-':
           /* [a-b]: b must not be below a, and ']' must follow. */
           b = nextc(s);
           if(b&lt;a || *s-&gt;p++!=']')
               goto error;
           s-&gt;max = b;
           return(a);
       case '*':
           /* [a*n]: n is read as octal if it starts with '0', else decimal. */
           base = (*s-&gt;p=='0')?8:10;
           n = 0;
           while((c = *s-&gt;p)&gt;='0' && c&lt;'0'+base) {
               n = base*n + c - '0';
               s-&gt;p++;
           }
           if(*s-&gt;p++!=']') goto error;
           /* A missing or zero count is replaced by 1000. */
           if(n==0) n = 1000;
           s-&gt;rep = n;
           return(a);
       default:
       error:
           /* Malformed bracket construct: report on descriptor 1 and exit. */
           write(1,"Bad string\n",11);
           exit(0);     &lt;------original was exit();
       }
   }
   return(nextc(s));
}

/*
 * Read one raw character from s-&gt;p, decoding backslash escapes, and
 * return it masked to 8 bits. Returns 0 at the end of the string.
 */
nextc(s)
struct string *s;
{
   int c, i, n;

   c = *s-&gt;p++;
   if(c=='\\') {
       /* Up to three octal digits after the backslash form one character. */
       i = n = 0;
       while(i&lt;3 && (c = *s-&gt;p)&gt;='0' && c&lt;='7') {
           n = n*8 + c - '0';
           i++;
           s-&gt;p++;
       }
       if(i&gt;0) c = n;
       /* No digits: the character after the backslash is taken as is. */
       else c = *s-&gt;p++;
   }
   /* On a 0 result, step the pointer back and store 0 there, so that
      later calls read 0 again from the same position. */
   if(c==0) *--s-&gt;p = 0;
   return(c&0377);
}

所以cc tr.c編譯：

tr.c: In function ‘next’:
tr.c:118:4: warning: incompatible implicit declaration of built-in function ‘exit’ 
[enabled by default]
exit(0);
^

但是 a.out 在那裡並且有效，所以現在讓我們比較一下我們擁有的兩個程序的填充行為：

GNU tr

#tr 0123456789 d     
0123456789 input
dddddddddd output             &lt;----- BSD classic behavior

#tr 0123456789 d123456789     &lt;----- padding set2 with set1 explicitly 
0123456789 i
d123456789 o
01234567890123456789 i
d123456789d123456789 o

#tr -t 0123456789 d           &lt;----- --truncate-set1 i.e. System V behavior
0123456789 i
d123456789 o                  &lt;----- concretely, this is what is meant by a result 
0012 i                               where set2 was padded with set1
dd12 o

#tr -t 0123456789 d123456789  &lt;----- padding set2 with set1 explicitly
0123456789 i                  
d123456789 o                  &lt;----- note this is identical to the last results

Unix V5 tr + stdio 模組

#./a.out 0123456789 d         &lt;----- our compiled version with the classic example
0123456789 i
d123456789 o

./a.out 0123456789 d123456789 &lt;----- padding set2 with set1 explicitly
0123456789 i
d123456789 o

因此，我們的 V5 版本在這方面的行為與 System V 版本完全相同。此外，用 set1 顯式填充 set2 會為所有實現產生相同的結果，因為它確保 set1 和 set2 具有相同數量的元素（當你沒有這個時，結果會在歷史上有所不同）。

最後，就結果而言，顯式填充，或者原始 V4-V5 手冊中描述的由 tr 用 set1 填充 set2（pad set2 with set1），與將 set1 截斷到 set2 的長度（truncating set1 to the length of set2）具有相同的含義——這正是經典 System V 實現所採用的填充方式，並產生相同的結果。儘管手冊頁有所不同，但 V5 的 tr 並不是一個不同的實現。

引用自：https://unix.stackexchange.com/questions/107782

set2 的歷史 Unix V5 tr 命令填充行為與我們今天認為的“經典” System V（1983-1988）行為不同嗎？

相關問答

為什麼 tr 命令不從文件中讀取？

/etc/mtab 的歷史是什麼？它更新了什麼？

linux 核心是否對其“公共 API”進行了不兼容的更改？

導出時哪個軟體會以這種格式發出日期？

tr 不替換為空格，而是刪除字元

像 history 和 fc 但用於有用命令的個性化列表