home bbs files messages ]

Forums before death by AOL, social media and spammers... "We can't have nice things"

   comp.lang.c      Meh, in C you gotta define EVERYTHING      243,242 messages   

[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]

   Message 242,424 of 243,242   
   Michael Sanders to Scott Lurndal   
   Re: is_binary_file()   
   11 Dec 25 01:09:59   
   
   From: porkchop@invalid.foo   
      
   On Wed, 10 Dec 2025 22:07:24 GMT, Scott Lurndal wrote:   
      
   > Typically a soi disant extended ASCII character set (e.g. ISO-8859-1)   
   > have the first 32 bytes starting at 128 defined as control characters.   
   >   
   > https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Code_page_layout   
      
   Many thanks Scott. Here's my final stab at the idea.   
      
   Beware word-wrap...   
      
   #include <stdio.h>
   #include <stdint.h>
      
   /*   
    * is_text_file()   
    *   
    * Determines whether a file is "probably text" or binary, using a heuristic   
    * based on mostly printable characters.   
    *   
    * Detection modes:   
    *   TEXT_LOOSE     - Allows ASCII printable bytes (0x20–0x7E), TAB/LF/CR,   
    *                    and all high-bit bytes (>=128). Tolerant for UTF-8 or   
    *                    ISO-8859-1 text.   
    *   TEXT_STRICT    - Rejects ASCII control characters (0x00–0x08,
    *                    0x0B–0x0C, 0x0E–0x1F) and C1 controls (0x80–0x9F).
    *                    Counts only clearly printable bytes.
    *   TEXT_ISO8859_1 - Accepts ASCII printable (0x20–0x7E), ISO-8859-1   
    *                    printable bytes (0xA0–0xFF), and TAB/LF/CR. Rejects   
    *                    C1 controls (0x80–0x9F).   
    *   
    * Returns:   
    *   1 file is probably text (>=90% printable characters)   
    *   0 file is probably binary (too many non-printable characters)   
    *  -1 empty file   
    *  -2 could not open file   
    */   
      
   typedef enum {   
       TEXT_LOOSE,    // mostly printable: ASCII + high-bit   
       TEXT_STRICT,   // stricter: reject C1 controls   
       TEXT_ISO8859_1 // ISO-8859-1 printable (0x20–0x7E + 0xA0–0xFF)   
   } text_mode_t;   
      
   static const uint8_t MASK[16] = {   
       0x00, 0x24, 0x00, 0x00, // 0x00–0x0F: TAB(09), LF(0A), CR(0D)   
       0xFF, 0xFF, 0xFF, 0xFF, // 0x10–0x2F: SPC!"#$%&'()*+,-./   
       0xFF, 0xFF, 0xFF, 0xFF, // 0x30–0x4F: 0123456789:;<=>?@   
       0xFF, 0xFF, 0xFF, 0x7F  // 0x50–0x7F: A–Z [\]^_` a–z (exclude DEL)   
   };   
      
   int is_text_file(const char *path, text_mode_t mode) {   
       FILE *f = fopen(path, "rb");   
       if (!f) return -2;   
      
       unsigned char chunk[4096];   
       uint64_t n, i, good = 0, total = 0;   
      
       while ((n = fread(chunk, 1, sizeof(chunk), f)) > 0) {   
           total += n;   
      
           for (i = 0; i < n; i++) {   
               unsigned char c = chunk[i];   
      
               switch (mode) {   
                   case TEXT_LOOSE:   
                       if (c >= 128 || (c < 128 && (MASK[c >> 3] & (1 << (c &   
   7))))) good++;   
                       break;   
      
                   case TEXT_STRICT: // reject C1 controls 0x80–0x9F   
                       if ((c >= 128 && c <= 159) || (c < 128 && !(MASK[c >> 3] &   
   (1 << (c & 7))))) {   
                           // bad byte, do not count...   
                       } else good++;   
                       break;   
      
                   case TEXT_ISO8859_1: // accept 0x20–0x7E + 0xA0–0xFF,   
   reject C1 controls   
                       if ((c >= 0x20 && c <= 0x7E) || (c >= 0xA0 && c <= 0xFF)   
                           || c == 0x09 || c == 0x0A || c == 0x0D) { good++; }   
                       break;   
               }   
           }   
       }   
      
       fclose(f);   
      
       if (total == 0) return -1; // empty file   
      
       return (good * 10 >= total * 9) ? 1 : 0; // 90% threshold   
   }   
      
   --   
   :wq   
   Mike Sanders   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   

[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]


(c) 1994,  bbs@darkrealms.ca