Home My Page Projects Code Snippets Project Openings diderot
Summary Activity Tracker Tasks SCM

SCM Repository

[diderot] Annotation of /branches/vis15/config/expand-utf8.c
ViewVC logotype

Annotation of /branches/vis15/config/expand-utf8.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3722 - (view) (download) (as text)

1 : jhr 3722 /*! \file expand-utf8.c
2 :     *
3 :     * \author John Reppy
4 :     *
5 :     * This filter replaces non-ASCII UTF-8 multibyte sequences with sequences of SML
6 :     * string escapes. Its purpose is to allow building Diderot using the MLton
7 :     * compiler (mlton.org), which does not allow non-7-bit ASCII characters in string
8 :     * literals. Note that we assume that the source is valid w.r.t. SML/NJ, so we
9 :     * look for UTF-8 multibyte headers independent of context.
10 :     */
11 :    
12 :     /*
13 :     * This code is part of the Diderot Project (http://diderot-language.cs.uchicago.edu)
14 :     *
15 :     * COPYRIGHT (c) 2016 The University of Chicago
16 :     * All rights reserved.
17 :     */
18 :    
19 :     #include <ctype.h>
20 :     #include <stdio.h>
21 :    
/* Bit patterns for classifying the bytes of a UTF-8 multibyte sequence.
 * A byte b is in a given class when (b & <class>_MASK) == <class>_VAL.
 * In the bit diagrams, 'x' marks a bit position that is not tested.
 */
enum {
    /* extension (continuation) byte: 10xx xxxx */
    TX_MASK = 0xC0,
    TX_VAL  = 0x80,
    /* header followed by one extension byte: 110x xxxx */
    T1_MASK = 0xE0,
    T1_VAL  = 0xC0,
    /* header followed by two extension bytes: 1110 xxxx */
    T2_MASK = 0xF0,
    T2_VAL  = 0xE0,
    /* header followed by three extension bytes: 1111 0xxx */
    T3_MASK = 0xF8,
    T3_VAL  = 0xF0
};
30 :    
/*! \brief Write one byte to stdout as a three-digit decimal string escape.
 *
 * The output is a backslash followed by the value of the low eight bits of
 * \a c, zero-padded to exactly three decimal digits (e.g. 0xC3 is written
 * as \195 and 7 as \007).
 *
 * \param c the byte to escape; bits above the low eight are ignored.
 */
void putEscapeChar (unsigned int c)
{
    /* the argument is unsigned, so use the unsigned conversion (%u, not %d) */
    printf ("\\%03u", c & 0xffu);
}
35 :    
36 :     int main ()
37 :     {
38 :     unsigned int c;
39 :    
40 :     while ((c = getchar()) != EOF) {
41 :     if (c <= 0x7f) { /* ASCII */
42 :     putchar (c);
43 :     }
44 :     else { /* multibyte */
45 :     int i, nc;
46 :     if ((c & T1_MASK) == T1_VAL) { nc = 1; }
47 :     else if ((c & T2_MASK) == T2_VAL) { nc = 2; }
48 :     else if ((c & T3_MASK) == T3_VAL) { nc = 3; }
49 :     else {
50 :     fprintf (stderr, "expand-utf8: invalid UTF-8 header byte %#0x\n", c);
51 :     return 1;
52 :     }
53 :     putEscapeChar (c);
54 :     for (i = 0; i < nc; i++) {
55 :     if ((c = getchar()) == EOF) {
56 :     fprintf (stderr, "expand-utf8: unexpected EOF in multibyte character\n");
57 :     return 1;
58 :     }
59 :     else if ((c & TX_MASK) != TX_VAL) {
60 :     fprintf (stderr, "expand-utf8: invalid UTF-8 extension byte %#0x\n", c);
61 :     return 1;
62 :     }
63 :     putEscapeChar(c);
64 :     }
65 :     }
66 :     }
67 :    
68 :     if (ferror(stdin)) {
69 :     perror ("expand-utf8");
70 :     return 1;
71 :     }
72 :    
73 :     return 0;
74 :    
75 :     }

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0