From: porkchop@invalid.foo   
      
   On Thu, 11 Dec 2025 01:09:59 -0000 (UTC), Michael Sanders wrote:   
      
   > [...]   
   >   
   > if (c >= 128 || (c < 128 && (MASK[c >> 3] & (1 << (c & 7))))) good++;   
   >   
   > [...]   
      
   Thinking about it more, the bit-twiddling method, while fast,
   is certainly not very readable/maintainable. Those who might
   want to use any of the variations I've written will be best
   served by the one shown below. It doesn't have all the bells &
   whistles of the prior offering, but sometimes that's a good thing.
      
   Note: If you keep map[] 'out in the open' (globally exposed),
   it's only computed once at runtime instead of every time...
      
   Well off to work for me.   
      
   #include <stdint.h>
   #include <stdio.h>
      
   /*   
    * is_text_file()   
    *   
    * Determines whether a file is 'probably text' based on ISO-8859-1 rules.   
    * Uses a precomputed lookup table for fast byte validation.   
    *   
    * Valid bytes:   
    * - ASCII printable: 0x20–0x7E   
    * - ISO-8859-1 high printable: 0xA0–0xFF   
    * - Whitespace/control: TAB (0x09), LF (0x0A), CR (0x0D)   
    *   
    * Invalid bytes (binary indicators):   
    * - NULL byte (0x00)   
    * - C0 controls (0x01–0x08, 0x0B–0x0C, 0x0E–0x1F)   
    * - DEL (0x7F)   
    * - C1 controls (0x80–0x9F)   
    *   
    * Returns:   
    * 1 - file is considered text   
    * 0 - file is considered binary   
    * -1 - could not open file   
    */   
      
   static const uint8_t map[256] = {   
    0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0, // 00   
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 10   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 20   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 30   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 40   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 50   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 60   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, // 70   
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 80   
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 90   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // A0   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // B0   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // C0   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // D0   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // E0   
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 // F0   
   };   
      
   int is_text_file(const char *path) {   
      
    FILE *f = fopen(path, "rb");   
    if (!f) return -1; // could not open file   
      
    // larger chunk size means less 'touching' the drive   
    unsigned char chunk[65536];   
    size_t n, i;   
      
    while ((n = fread(chunk, 1, sizeof(chunk), f)) > 0) {   
    for (i = 0; i < n; i++) {   
    if (!map[chunk[i]]) {   
    fclose(f);   
    return 0; // binary detected   
    }   
    }   
    }   
      
    fclose(f);   
    return 1; // probally text   
   }   
      
   // eof   
      
   --   
   :wq   
   Mike Sanders   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|