讀取 `/usr/share/dict/words` 的副本比文件本身慢 10 倍

October 21, 2021

我正在嘗試用 C 語言實現字典，發現 `/usr/share/dict/words` 是一個非常好的測試文件。出於某些原因，我把 words 文件複製了一份到我的工作目錄中，但令我驚訝的是，程序在讀取這份副本時明顯變慢了。這種行為該如何解釋？這兩個文件的內容是完全相同的。

如果要我猜的話，可能是因為 `/usr/share/dict/words` 是一個經常被使用的文件，所以它已經被緩存在記憶體中了？

#define _GNU_SOURCE

#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

/*
 * GET_TIME(now): store the current time of day, expressed in seconds as a
 * floating-point value, into the lvalue `now`.
 *
 * The clock is read with gettimeofday(); the microsecond field is scaled to
 * seconds and added to the whole-second field.
 */
#define GET_TIME(now)                                     \
   do {                                                   \
       struct timeval tv_;                                \
       gettimeofday(&tv_, NULL);                          \
       (now) = tv_.tv_sec + tv_.tv_usec / 1e6;            \
   } while (0)

/*
 * REPORT(msg, time): print one timing line on stdout -- `msg` left-justified
 * in a 10-character field, followed by "- " and `time` formatted with %f.
 */
#define REPORT(msg, time)                                   \
   do {                                                     \
       fprintf(stdout, "%-10s- %f\n", (msg), (time));       \
   } while (0)

/* Set to 1 to make get_dict() print every line it rejects; 0 keeps it quiet. */
#define SHOW_INVALID    0

/* Number of word slots in a struct dict. */
enum { DICT_CAPACITY = 110000 };

/*
 * Fixed-capacity list of words.
 *   n    - how many slots of `data` are currently in use
 *   data - pointers to the stored strings; slots [0, n) are populated
 */
struct dict {
   int n;
   char *data[DICT_CAPACITY];
};

/*
 * Report whether `input` consists solely of lowercase letters and newlines.
 *
 * Returns 1 when every character before the terminating NUL is either
 * accepted by islower() or is '\n' (so the empty string is valid), and 0 as
 * soon as any other character is found.
 */
int valid_word(char *input)
{
   for (int i = 0; input[i]; i++) {
       /*
        * <ctype.h> functions require an argument representable as
        * unsigned char (or EOF).  A plain char may be negative -- e.g. a
        * byte of a multi-byte sequence where char is signed -- and passing
        * that directly is undefined behaviour, hence the conversion.
        */
       unsigned char c = (unsigned char)input[i];
       if (c != '\n' && !islower(c)) {
           return 0;
       }
   }
   return 1;
}

struct dict *get_dict(char *file)
{
   struct dict *dict = calloc(1, sizeof(struct dict));
   FILE *fp = fopen(file, "r");
   char input[128];
   while (fgets(input, 128, fp)) {
       if (valid_word(input)) {
           dict-&gt;data[dict-&gt;n++] = strdup(input);
       } else {
#if SHOW_INVALID == 1
           printf("Skipping invalid word %s", input);
#endif
       }
   }
   fclose(fp);
   return dict;
}

void destroy_dict(struct dict *dict)
{
   for (int i = 0; i &lt; dict-&gt;n; i++) {
       free(dict-&gt;data[i]);
   }
   free(dict);
}

int search(struct dict *dict, int l, int r, char *word)
{
   if (l &gt; r) return -1;
   int mid = l + (r - l) / 2;
   if (!strcmp(dict-&gt;data[mid], word)) return mid;
   if (strcmp(dict-&gt;data[mid], word) &gt; 0) return search(dict, l, mid - 1, word);
   return search(dict, mid + 1, r, word);
}

int match(struct dict *dict, char *word)
{
   return search(dict, 0, dict-&gt;n - 1, word);
}

/*
 * Check the dictionary against every line of `file`.
 *
 * Each chunk read by fgets() (at most 127 characters) that valid_word()
 * accepts must be found by match(); each chunk it rejects must not be
 * found.  Violations trip assert(), so the checks compile away when NDEBUG
 * is defined.
 *
 * If `file` cannot be opened, a diagnostic is printed and the program exits
 * with EXIT_FAILURE: a check that could not run must not look like a pass.
 */
void test(struct dict *dict, char *file)
{
   FILE *fp = fopen(file, "r");
   if (!fp) {
       perror(file);
       exit(EXIT_FAILURE);
   }

   char input[128];
   while (fgets(input, sizeof input, fp)) {
       if (valid_word(input)) {
           assert(match(dict, input) != -1);
       } else {
           assert(match(dict, input) == -1);
       }
   }
   fclose(fp);
}

/*
 * Benchmark driver: times building the dictionary from "words", checking it
 * against "words" and then "words_random", and tearing it down.  Prints one
 * REPORT line per phase, then a blank line and the total elapsed time.
 *
 * Returns 0 on success, or EXIT_FAILURE if get_dict() returned NULL.
 */
int main(void)
{
   double init, start, end;
   GET_TIME(init);

   GET_TIME(start);
   struct dict *dict = get_dict("words");
   GET_TIME(end);
   if (!dict) {
       /* Nothing to test against; stop instead of dereferencing NULL below. */
       fprintf(stderr, "could not build the dictionary from \"words\"\n");
       return EXIT_FAILURE;
   }
   REPORT("setup", end - start);

   GET_TIME(start);
   test(dict, "words");
   GET_TIME(end);
   REPORT("words", end - start);

   GET_TIME(start);
   test(dict, "words_random");
   GET_TIME(end);
   REPORT("randwords", end - start);

   GET_TIME(start);
   destroy_dict(dict);
   GET_TIME(end);
   REPORT("teardown", end - start);

   puts("");
   REPORT("total", end - init);

   return 0;
}

正如@Vilinkameni 所指出的，如果正在訪問的文件位於不同的物理設備或文件系統類型上，則 GNU/Linux 中的 I/O 性能可能會有所不同。
在我的例子中，WSL2 使用的是一個虛擬硬碟（VHD），但我的工作目錄（也就是在 WSL 中 `cd` 進入的目標）實際上位於我的 `C:/` 磁碟上。因此，在訪問 `/usr/share/dict/words` 文件時，讀取仍然發生在 WSL2 的 VHD 之內；但如果我把文件複製到 `C:/` 磁碟，性能就會下降——因為這時必須讀取位於另一個“文件系統”上的文件。
為了驗證這一點，我把程序移動到 `/usr/share/dict/`，並在那裡創建了一份 `words` 文件的副本，現在兩者的性能是相同的。

引用自：https://unix.stackexchange.com/questions/674151

讀取 `/usr/share/dict/words` 的副本比文件本身慢 10 倍

相關問答

文件的意外訪問權限

在 Linux 中沒有顯式同步的情況下，對文件的單獨阻塞 IO 訪問在執行時是否一致

fstat 是否需要在 Linux/ext4 上訪問磁碟？

執行已編譯的 C 程序時遇到問題

是 `open()`、`mmap()` 還是兩者都不是，更基本的功能？

讀取 ELF 文件的內容（以程式方式）

讀取 /usr/share/dict/words 的副本比文件本身慢 10 倍

相關問答

文件的意外訪問權限

在 Linux 中沒有顯式同步的情況下，對文件的單獨阻塞 IO 訪問在執行時是否一致

fstat 是否需要在 Linux/ext4 上訪問磁碟？

執行已編譯的 C 程序時遇到問題

是 open()、mmap() 還是兩者都不是，更基本的功能？

讀取 ELF 文件的內容（以程式方式）

讀取 `/usr/share/dict/words` 的副本比文件本身慢 10 倍

是 `open()`、`mmap()` 還是兩者都不是，更基本的功能？