From: porkchop@invalid.foo   
      
   On Wed, 10 Dec 2025 22:07:24 GMT, Scott Lurndal wrote:   
      
   > Typically a soi-disant extended ASCII character set (e.g. ISO-8859-1)
   > has the first 32 bytes starting at 128 defined as control characters.
   >   
   > https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Code_page_layout   
      
   Many thanks Scott. Here's my final stab at the idea.   
      
   Beware word-wrap...   
      
   #include <stdint.h>
   #include <stdio.h>
      
   /*   
    * is_text_file()   
    *   
    * Determines whether a file is "probably text" or binary, using a heuristic   
    * based on mostly printable characters.   
    *   
    * Detection modes:   
    * TEXT_LOOSE - Allows ASCII printable bytes (0x20–0x7E), TAB/LF/CR,   
    * and all high-bit bytes (>=128). Tolerant for UTF-8 or   
    * ISO-8859-1 text.   
    * TEXT_STRICT - Rejects ASCII control characters (0x00–0x08, 0x0B–0x0C,
    * 0x0E–0x1F) and C1 controls (0x80–0x9F). Counts only   
    * clearly printable bytes.   
    * TEXT_ISO8859_1 - Accepts ASCII printable (0x20–0x7E), ISO-8859-1   
    * printable bytes (0xA0–0xFF), and TAB/LF/CR. Rejects   
    * C1 controls (0x80–0x9F).   
    *   
    * Returns:   
    * 1 file is probably text (>=90% printable characters)   
    * 0 file is probably binary (too many non-printable characters)   
    * -1 empty file   
    * -2 could not open file   
    */   
      
   typedef enum {   
    TEXT_LOOSE, // mostly printable: ASCII + high-bit   
    TEXT_STRICT, // stricter: reject C1 controls   
    TEXT_ISO8859_1 // ISO-8859-1 printable (0x20–0x7E + 0xA0–0xFF)   
   } text_mode_t;   
      
   static const uint8_t MASK[16] = {   
    0x00, 0x24, 0x00, 0x00, // 0x00–0x0F: TAB(09), LF(0A), CR(0D)   
    0xFF, 0xFF, 0xFF, 0xFF, // 0x10–0x2F: SPC!"#$%&'()*+,-./   
    0xFF, 0xFF, 0xFF, 0xFF, // 0x30–0x4F: 0123456789:;<=>?@   
    0xFF, 0xFF, 0xFF, 0x7F // 0x50–0x7F: A–Z [\]^_` a–z (exclude DEL)   
   };   
      
   int is_text_file(const char *path, text_mode_t mode) {   
    FILE *f = fopen(path, "rb");   
    if (!f) return -2;   
      
    unsigned char chunk[4096];   
    uint64_t n, i, good = 0, total = 0;   
      
    while ((n = fread(chunk, 1, sizeof(chunk), f)) > 0) {   
    total += n;   
      
    for (i = 0; i < n; i++) {   
    unsigned char c = chunk[i];   
      
    switch (mode) {   
    case TEXT_LOOSE:   
    if (c >= 128 || (c < 128 && (MASK[c >> 3] & (1 << (c &   
   7))))) good++;   
    break;   
      
    case TEXT_STRICT: // reject C1 controls 0x80–0x9F   
    if ((c >= 128 && c <= 159) || (c < 128 && !(MASK[c >> 3] &   
   (1 << (c & 7))))) {   
    // bad byte, do not count...   
    } else good++;   
    break;   
      
    case TEXT_ISO8859_1: // accept 0x20–0x7E + 0xA0–0xFF,   
   reject C1 controls   
    if ((c >= 0x20 && c <= 0x7E) || (c >= 0xA0 && c <= 0xFF)   
    || c == 0x09 || c == 0x0A || c == 0x0D) { good++; }   
    break;   
    }   
    }   
    }   
      
    fclose(f);   
      
    if (total == 0) return -1; // empty file   
      
    return (good * 10 >= total * 9) ? 1 : 0; // 90% threshold   
   }   
      
   --   
   :wq   
   Mike Sanders   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|