Logo Search packages:      
Sourcecode: hex version File versions  Download package

hex.cc

//
// hex - hexadecimal dumping tool for Japanese
// copyright (c) TAGA Nayuta <nayuta@is.s.u-tokyo.ac.jp>
//


// Toolchain detection.  At most one of MSVC / GNUWIN32 ends up defined;
// the rest of the file keys its Windows-specific includes, binary-mode
// calls and default coding systems off these two macros.
#ifdef WIN32 // Microsoft Visual C++
#define MSVC
#elif _WIN32 // Cygnus GNU Win32 gcc
#define GNUWIN32
#endif


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined(MSVC)
#include <io.h>
#include <fcntl.h>
#elif defined(GNUWIN32)
#include <sys/fcntl.h>
#include <unistd.h>
extern "C" {
int setmode(int handle, int mode);
char *strtok(char *, const char *);
}
#else
#include <unistd.h>
#endif


// Capacity (in elements) of the push-back buffer, of does_match()'s
// look-ahead buffer and of restore()'s line buffer.
#define BUFSIZE 1024
// Raw byte and 16-bit helper types used by the SJIS <-> JIS conversions.
typedef unsigned char  BYTE;
typedef unsigned short WORD;
// When MSVC is defined, bool/true/false are supplied as macros.
#if defined(MSVC)
#define bool int
#define true  1
#define false 0
#endif


// Convert one Shift_JIS kanji (two bytes) to its JIS byte pair.
//   jis  : receives the two JIS bytes
//   sjis : the two Shift_JIS bytes; may alias jis, because both input
//          bytes are read before anything is written
// All arithmetic is done modulo 256 (BYTE).
void sjis2jis(BYTE *jis, const BYTE *sjis)
{
  BYTE lead  = sjis[0];
  BYTE trail = sjis[1];

  // Fold the upper lead-byte range (0xe0..) onto the lower one.
  if (lead >= 0xe0)
    lead = (BYTE)(lead - 0x40);
  lead = (BYTE)(((int)lead - 0x81) * 2 + 0x21);

  // Step over the hole at 0x7f in the trail byte, then rebase it.
  if (trail >= 0x80)
    trail = (BYTE)(trail - 1);
  trail = (BYTE)(trail - 0x1f);

  // Each SJIS lead byte covers two JIS rows; the upper half of the trail
  // range belongs to the second row.
  if (trail >= 0x7f)
  {
    trail = (BYTE)(trail - 0x5e);
    lead = (BYTE)(lead + 1);
  }

  jis[0] = lead;
  jis[1] = trail;
}


// Convert one JIS kanji (two bytes; bit 7 is ignored, so EUC bytes work
// too) to its Shift_JIS byte pair.
//   sjis : receives the two Shift_JIS bytes
//   jis  : the two source bytes; may alias sjis, because both input bytes
//          are read before anything is written
void jis2sjis(BYTE *sjis, const BYTE *jis)
{
  WORD lead  = (WORD)(jis[0] & 0x7f);
  WORD trail = (WORD)(jis[1] & 0x7f);

  // Two JIS rows share one SJIS lead byte: an even row maps to the upper
  // half of the trail-byte range of the preceding odd row.
  if ((lead & 1) == 0)
  {
    lead--;
    trail += 0x5e;
  }
  lead = (WORD)((lead - 0x21) / 2 + 0x81);

  trail += 0x1f;
  if (trail >= 0x7f) // 0x7f is never used as a trail byte
    trail++;
  if (lead >= 0xa0)  // jump over the 0xa0..0xdf range to the 0xe0.. leads
    lead += 0x40;

  sjis[0] = (BYTE)lead;
  sjis[1] = (BYTE)trail;
}


// Push-back stack shared by _getc()/_ungetc(): ungetc_p points one past the
// most recently pushed byte and equals ungetc_buf when the stack is empty.
BYTE ungetc_buf[BUFSIZE], *ungetc_p = ungetc_buf; // buffer for _ungetc
// Set by _getc() once fgetc() has returned EOF; from then on _getc() stops
// calling fgetc().
bool is_eof = false;


// _getc: read the next input byte.
// Bytes pushed back with _ungetc() are returned first (most recent first);
// otherwise the byte comes from ifp.  After ifp has reported EOF once, EOF
// is returned without touching ifp again.
int _getc(FILE *ifp)
{
  if (ungetc_p != ungetc_buf)
    return *(--ungetc_p);

  if (is_eof)
    return EOF;

  const int ch = fgetc(ifp);
  if (ch == EOF)
    is_eof = true;
  return ch;
}


// _ungetc: push one byte back so that the next _getc() returns it.
// EOF is silently ignored.  No capacity check is made against BUFSIZE.
void _ungetc(int c)
{
  if (c == EOF)
    return;
  *ungetc_p = (BYTE)c;
  ungetc_p++;
}


// Look ahead: report whether the next `size` input bytes equal data[0..size).
// Every byte that was read while checking is pushed back again, so the
// input position is unchanged when the function returns.
bool does_match(BYTE *data, int size, FILE *ifp)
{
  int consumed[BUFSIZE]; // bytes read so far, in input order
  int last = 0;          // index of the most recently read byte
  bool matched = false;

  for (;;)
  {
    const int ch = _getc(ifp);
    consumed[last] = ch;
    if (ch == EOF || (BYTE)ch != data[last])
      break;
    if (last == size - 1)
    {
      matched = true;
      break;
    }
    last++;
  }

  // Undo the reads, last byte first, so they come back in original order.
  while (0 <= last)
  {
    _ungetc(consumed[last]);
    last--;
  }
  return matched;
}


// SGR parameter strings placed between ESC"[" and "m" when colouring.
// Declared const char * because they start out pointing at string literals
// (binding a literal to plain char * is ill-formed since C++11); they may
// later be redirected at an argv element by -cs1 / -cs2.
const char *color_str1 = "34";    // color 1
const char *color_str2 = "36";    // color 2
bool is_tty = false;        // whether output goes to a tty
bool does_restore = false;  // --restore
bool disable_si_so = false; // -dsiso

// How bytes that cannot be shown as characters are rendered.
enum _nonletter_attribute { color, bold, underline, text, dot };
_nonletter_attribute nonletter_attribute = dot; // which decoration to apply

// Character encodings the tool can read and/or write.  `unknown` is only
// ever assigned to input_coding_system (see the -U option).
enum _coding_system { iso_2022_jp, euc_japan, sjis, unknown };

// Compile-time default for the OUTPUT encoding.  Exactly one of the
// OUTPUT_* macros expands to the string "(default)" and the other two
// expand to nothing; they are spliced into the help text.
#if defined(OJIS)
#define OUTPUT_EUC
#define OUTPUT_SJIS
#define OUTPUT_JIS "(default)"
_coding_system output_coding_system = iso_2022_jp;
#elif defined(OSJIS) || ((defined(MSVC)||defined(GNUWIN32)) && !defined(OEUC))
#define OUTPUT_EUC
#define OUTPUT_SJIS "(default)"
#define OUTPUT_JIS
_coding_system output_coding_system = sjis;
#else
#define OUTPUT_EUC "(default)"
#define OUTPUT_SJIS
#define OUTPUT_JIS
_coding_system output_coding_system = euc_japan;
#endif

// Compile-time default for the INPUT encoding, with the same "(default)"
// marker scheme as above.
// NOTE(review): the Windows opt-out tested here is OEUC (the output macro),
// not an input-specific macro -- confirm this is intended.
#if defined(ISJIS) || ((defined(MSVC) || defined(GNUWIN32)) && !defined(OEUC))
#define INPUT_EUC
#define INPUT_SJIS "(default)"
_coding_system input_coding_system  = sjis;
#else
#define INPUT_EUC "(default)"
#define INPUT_SJIS
_coding_system input_coding_system  = euc_japan;
#endif


// Control characters as string literals, meant to be concatenated with
// adjacent literals in format strings: ESC is 0x1b, BS is 0x08.
#define ESC "\x1b"
#define BS "\x8"


// 文字として表示できない文字
void output_nonletter(int c, FILE *ofp, bool use_color2 = false)
{
  if (0x0 <= c && c <= 0x1f) c += '@';
  else if (c == 0x7f)        c  = '?';
  else                       c  = '.';
  
  if (output_coding_system == iso_2022_jp)
    fprintf(ofp, ESC"(B"); // もどす
  if      (nonletter_attribute == color && use_color2)
    fprintf(ofp, ESC"[%sm%c"ESC"[m", color_str2, c);
  else if (nonletter_attribute == color)
    fprintf(ofp, ESC"[%sm%c"ESC"[m", color_str1, c);
  else if (nonletter_attribute == bold && is_tty)
    fprintf(ofp, ESC"[1m%c"ESC"[m", c);
  else if (nonletter_attribute == bold)
    fprintf(ofp, "%c"BS"%c", c, c);
  else if (nonletter_attribute == dot)
    fprintf(ofp, ".");
  else if (nonletter_attribute == underline && is_tty)
    fprintf(ofp, ESC"[4m%c"ESC"[m", c);
  else if (nonletter_attribute == underline)
    fprintf(ofp, "_"BS"%c", c);
  else if (nonletter_attribute == text)
    fprintf(ofp, "%c", c);
}


// Print one displayable single-byte character.
//   c          : the character
//   ofp        : output stream
//   use_color2 : when colour mode is active, wrap the character in the
//                color_str2 escape sequence
//
// String literals and the ESC macro are separated by whitespace: since
// C++11 a name directly after a literal is lexed as a literal suffix.
void output_letter(int c, FILE *ofp, bool use_color2 = false)
{
  if (output_coding_system == iso_2022_jp)
    fprintf(ofp, ESC "(B"); // switch back to ASCII
  if (nonletter_attribute == color && use_color2)
    fprintf(ofp, ESC "[%sm%c" ESC "[m", color_str2, c);
  else
    fprintf(ofp, "%c", c);
}


// Print one kanji given as a JIS byte pair (c1, c2), encoded in the
// current output coding system:
//   iso_2022_jp : ESC"$B" followed by the two bytes as they are
//   sjis        : the pair converted with jis2sjis()
//   otherwise   : both bytes with bit 7 set (EUC)
void output_kanji(int c1, int c2, FILE *ofp)
{
  if (output_coding_system == iso_2022_jp)
    fprintf(ofp, ESC "$B%c%c", c1, c2);
  else if (output_coding_system == sjis)
  {
    // Explicit casts: int -> BYTE inside braces is a narrowing conversion,
    // which C++11 rejects.
    BYTE buf[2] = { (BYTE)c1, (BYTE)c2 };
    jis2sjis(buf, buf); // in-place conversion is supported by jis2sjis
    fprintf(ofp, "%c%c", buf[0], buf[1]);
  }
  else
    fprintf(ofp, "%c%c", c1 | 0x80, c2 | 0x80);
}


// Print one half-width katakana given as a 7-bit code.
// 0x20 is printed as a plain space.  Otherwise the encoding follows the
// output coding system: ESC"(I" + code for iso_2022_jp, code|0x80 for
// sjis, and 0x8e followed by code|0x80 in every other case.
void output_kana(int c, FILE *ofp)
{
  if (c == 0x20)
  {
    fprintf(ofp, " ");
    return;
  }

  switch (output_coding_system)
  {
  case iso_2022_jp:
    fprintf(ofp, ESC "(I%c", c);
    break;
  case sjis:
    fprintf(ofp, "%c", c | 0x80);
    break;
  default:
    fprintf(ofp, "\x8e%c", c | 0x80);
    break;
  }
}


// Print the character column for one dump line: reads up to 16 bytes from
// ifp (through _getc, so pushed-back bytes are seen) and writes their
// textual rendering to ofp.
//
// State kept across calls (all function-local statics):
//   skip                : number of upcoming input bytes that belong to a
//                         sequence which has already been handled
//   skip_without_output : true  -> skipped bytes print nothing (trailing
//                                  byte of a kanji)
//                         false -> skipped bytes are printed using the
//                                  "color 2" variant (escape sequences,
//                                  EUC kana trailing byte)
//   mode                : current 7-bit shift state (none / jis / kana)
void output_letters(FILE *ifp, FILE *ofp)
{
  static int skip = 0;
  static bool skip_without_output = false;

  int i = 0;

  // A silently-skipped sequence straddles the previous line: its remaining
  // bytes are shown here as non-letters and counted as columns.
  if (skip_without_output && 0 < skip)
    for (; 0 < skip; skip--, i++)
      output_nonletter(_getc(ifp), ofp, true);

  for (; i < 16; i++)
  {
    int c = _getc(ifp);
    if (c == EOF)
      break;

    if (0 < skip)
    {
      if (!skip_without_output)
      {
        if (0x20 <= c && c <= 0x7e) // ASCII
          output_letter(c, ofp, true);
        else
          output_nonletter(c, ofp, true);
      }
      skip--;
      continue;
    }

    //                    | new JIS  | old JIS  | NEC JIS | others
    // kanji start        | ESC"$B"  | ESC"$@"  | ESC"K"  |
    //                    | ESC"$(B" |          |         |
    // 1-byte chars start | ESC"(J"  | ESC"(J"  | ESC"H"  | ESC"(B"  ESC"(H" ^O
    // 1-byte kana start  |          |          |         | ESC"(I"           ^N
    // (ESC"H" appears in the table but is not matched by the code below.)

    static enum { none, jis, kana } mode = none; // mode of the 7-bit part

    // Unknown input encoding: plain ASCII / non-letter rendering only.
    if (input_coding_system == unknown)
    {
      if (0x20 <= c && c <= 0x7e)
        output_letter(c, ofp);
      else
        output_nonletter(c, ofp);
      continue;
    }

    if (c == 0xe && !disable_si_so) // ^N
    {
      mode = kana;
      output_nonletter(c, ofp);
      continue;
    }

    if (c == 0xf) // ^O
    {
      mode = none;
      output_nonletter(c, ofp);
      continue;
    }

    // Line ends and NUL reset the shift state.
    if (c == '\r' || c == '\n' || c == '\0')
      mode = none;

    if (c == 0x1b) // ^[
    {
      // Peek at the bytes after ESC; on a known designator switch the mode
      // and mark its bytes to be shown (in color 2) rather than decoded.
      skip_without_output = false;
      if      (does_match((BYTE *)"$B" , 2, ifp)) mode = jis , skip = 2;
      else if (does_match((BYTE *)"$(B", 3, ifp)) mode = jis , skip = 3;
      else if (does_match((BYTE *)"$@" , 2, ifp)) mode = jis , skip = 2;
      else if (does_match((BYTE *)"K"  , 1, ifp)) mode = jis , skip = 1;
      else if (does_match((BYTE *)"(J" , 2, ifp)) mode = none, skip = 2;
      else if (does_match((BYTE *)"(H" , 2, ifp)) mode = none, skip = 2;
      else if (does_match((BYTE *)"(B" , 2, ifp)) mode = none, skip = 2;
      else if (does_match((BYTE *)"(I" , 2, ifp)) mode = kana, skip = 2;
      output_nonletter(c, ofp);
      continue;
    }

    skip_without_output = true;

    // One byte of look-ahead for the two-byte checks below.
    int c2 = _getc(ifp);
    _ungetc(c2);

    if (input_coding_system == euc_japan)
    {
      if ((0xa1 <= c  && c  <= 0xfe) && // kanji, 1st byte
          (0xa1 <= c2 && c2 <= 0xfe))   // kanji, 2nd byte
      {
        output_kanji(c & ~0x80, c2 & ~0x80, ofp);
        skip = 1;
        continue;
      }

      if ((c == 0x8e) && // half-width kana
          (0xa0 <= c2 && c2 <= 0xdf))
      {
        output_kana(c2 & ~0x80, ofp);
        skip_without_output = false;
        skip = 1;
        continue;
      }
    }
    else if (input_coding_system == sjis)
    {
      if (((0x81 <= c  && c  <= 0x9f) ||
           (0xe0 <= c  && c  <= 0xef)) && // kanji, 1st byte
          ((0x40 <= c2 && c2 <= 0x7e) ||
           (0x80 <= c2 && c2 <= 0xfc)))   // kanji, 2nd byte
      {
        // Explicit casts: int -> BYTE inside braces is a narrowing
        // conversion, which C++11 rejects.  Both values fit in a byte
        // because of the range checks above.
        BYTE buf[2] = { (BYTE)c, (BYTE)c2 };
        sjis2jis(buf, buf);
        output_kanji(buf[0], buf[1], ofp);
        skip = 1;
        continue;
      }

      if (0xa0 <= c && c <= 0xdf) // half-width kana
      {
        output_kana(c & ~0x80, ofp);
        continue;
      }
    }

    if (mode == jis)
    {
      if ((0x21 <= c  && c  <= 0x7e) && // kanji, 1st byte
          (0x21 <= c2 && c2 <= 0x7e))   // kanji, 2nd byte
      {
        output_kanji(c, c2, ofp);
        skip = 1;
        continue;
      }
    }

    if (mode == kana)
    {
      if (0x20 <= c && c <= 0x5f) // half-width kana
      {
        output_kana(c, ofp);
        continue;
      }
    }

    if (0x20 <= c && c <= 0x7e) // ASCII
    {
      output_letter(c, ofp);
      continue;
    }

    output_nonletter(c, ofp);
  }
}


// Dump the whole of ifp to ofp, 16 bytes per line:
//   "0x" + 8 hex digits + ": ", then 16 "xx " cells (blank cells past EOF)
//   with "- " after the 8th, then the character column from
//   output_letters(), then a newline.
// Returns when EOF is hit at the start of a line.
void output_16(FILE *ifp, FILE *ofp)
{
  // Unsigned: the offset is printed with %x and must not hit signed
  // overflow on inputs of 2 GiB and more.
  for (unsigned int address = 0; ; address += 16)
  {
    int buf[16];
    int i;
    for (i = 0; i < 16; i++)
    {
      buf[i] = _getc(ifp);
      if (i == 0)
      {
        if (buf[i] == EOF)
          return;
        fprintf(ofp, "0x%08x: ", address);
      }
      if (buf[i] == EOF)
        fprintf(ofp, "   ");
      else
        fprintf(ofp, "%02x ", (unsigned int)buf[i]);
      if (i == 7)
        fprintf(ofp, "- ");
    }
    // Hand the same 16 bytes back (last first) so that output_letters()
    // reads them again in their original order.
    for (i = 15; 0 <= i; i--)
      _ungetc(buf[i]);
    output_letters(ifp, ofp);
    if (output_coding_system == iso_2022_jp)
      fprintf(ofp, ESC "(B");
    fprintf(ofp, "\n");
  }
}


// Convert one hexadecimal digit character to its value (0..15).
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'; returns -1 for anything else.
int ctoi16(int c)
{
  int first;  // first character of the range c falls in
  int value;  // value represented by that first character

  if (c >= '0' && c <= '9')
  {
    first = '0';
    value = 0;
  }
  else if (c >= 'a' && c <= 'f')
  {
    first = 'a';
    value = 10;
  }
  else if (c >= 'A' && c <= 'F')
  {
    first = 'A';
    value = 10;
  }
  else
    return -1;

  return c - first + value;
}


// restore
void restore(FILE *ifp, FILE *ofp)
{
  while (1)
  {
    char c[BUFSIZE];
    fgets(c, sizeof(c), ifp);
    if (feof(ifp))
      break;
    
    int j = 12;
    int i;
    for (i = 0; i < 16; i++)
    {
      int k = ctoi16(c[j]);
      if (k < 0)
      break;
      k <<= 4;
      k += ctoi16(c[++j]);
      fprintf(ofp, "%c", k);
      j += 2;
      if (i == 7)
      j += 2;
    }
  }
}


// オプション解析
#define is_argv1(s) (strcmp(argv[1], s) == 0)
bool option(int &argc, char *argv[])
{
  bool r = true;
  while (1 < argc)
  {
    int remove = 1;
    if (is_argv1("-h") || is_argv1("-help"   ) || is_argv1("--help"   ) || 
      is_argv1("-v") || is_argv1("-version") || is_argv1("--version"))
    {
      fprintf(
      stderr,
      "hex version 2.04 - hexadecimal dumping tool for Japanese\n"
      "copyright (c) TAGA Nayuta <nayuta@is.s.u-tokyo.ac.jp>\n"
      "usage: hex [options ...] [filename]\n"
      "options:\n"
      " * show non-letter-code by ...\n"
      "    -c, --color                    : color (tty-output default)\n"
      "    -b, --bold                     : bold\n"
      "    -d, --dot                      : `.' (non-tty-output default)\n"
      "    -t, --text                     : code + '@' (not useful)\n"
      "    -u, --underline                : underline\n"
      " * output coding system is ...\n"
      "    -e, -oe, --oeuc                : *euc-japan* "OUTPUT_EUC"\n"
      "    -s, -os, --osjis               : *sjis* "OUTPUT_SJIS"\n"
      "    -j, -oj, --ojis                : *iso-2022-jp* "OUTPUT_JIS"\n"
      " * assume input coding system to be ...\n"
      "    -E, -ie, --ieuc                : *euc-japan* or *iso-2022-jp* "INPUT_EUC"\n"
      "    -S, -is, --isjis               : *sjis* or *iso-2022-jp* "INPUT_SJIS"\n"
      "    -U, -iu, --iunknown            : unknown\n"
      " * other ...\n"
      "    -cs1 <cs>, --colorstring1 <cs> : color-string1 is <cs> (ex. -cs1 '7;33')\n"
      "    -cs2 <cs>, --colorstring2 <cs> : color-string2 is <cs>\n"
      "    -siso, --enablesiso            : enable ^N/^O (SO/SI) KATAKANA (default)\n"
      "    -dsiso, --disablesiso, +siso   : disable ^N/^O (SO/SI) KATAKANA\n"
      "    -r, --restore                  : \"hex hoe | hex -r > hoge\" is \"cp hoe hoge\"\n"
      "... and all options can be set to HEX_OPTIONS environment.\n"
      );
      r = false;
    }
    
    else if (is_argv1("-c") || is_argv1("--color"    ))
      nonletter_attribute = color;
    else if (is_argv1("-b") || is_argv1("--bold"     ))
      nonletter_attribute = bold;
    else if (is_argv1("-d") || is_argv1("--dot"      ))
      nonletter_attribute = dot;
    else if (is_argv1("-t") || is_argv1("--text"     ))
      nonletter_attribute = text;
    else if (is_argv1("-u") || is_argv1("--underline"))
      nonletter_attribute = underline;
    
    else if (is_argv1("-e") || is_argv1("-oe") || is_argv1("--oeuc"    ))
      output_coding_system = euc_japan;
    else if (is_argv1("-s") || is_argv1("-os") || is_argv1("--osjis"   ))
      output_coding_system = sjis;
    else if (is_argv1("-j") || is_argv1("-oj") || is_argv1("--ojis"    ))
      output_coding_system = iso_2022_jp;
    
    else if (is_argv1("-E") || is_argv1("-ie") || is_argv1("--ieuc"    ))
      input_coding_system = euc_japan;
    else if (is_argv1("-S") || is_argv1("-is") || is_argv1("--isjis"   ))
      input_coding_system = sjis;
    else if (is_argv1("-U") || is_argv1("-iu") || is_argv1("--iunknown"))
      input_coding_system = unknown;
    
    else if ((is_argv1("-cs1") || is_argv1("--colorstring1")) && 3 <= argc)
    {
      color_str1 = argv[2];
      remove = 2;
    }
    else if ((is_argv1("-cs2") || is_argv1("--colorstring2")) && 3 <= argc)
    {
      color_str2 = argv[2];
      remove = 2;
    }
    
    else if (is_argv1("-siso") || is_argv1("--enablesiso"))
      disable_si_so = false;

    else if (is_argv1("-dsiso") || is_argv1("--disablesiso") ||
           is_argv1("+siso"))
      disable_si_so = true;
    
    else if (is_argv1("-r") || is_argv1("--restore"))
      does_restore = true;
    
    else
      break;
    
    for (int i = 1 + remove; i < argc; i++)
      argv[i - remove] = argv[i];
    argc -= remove;
  }
  return r;
}


// Parse options from the HEX_OPTIONS environment variable.
// The value is split on blanks, tabs, CR and LF into an argv-style array
// (slot 0 is left unset) and handed to option().  Returns option()'s
// result, or true when the variable is not set.
bool env_hex_options(void)
{
  static const char delimiters[] = " \t\r\n";

  const char *env = getenv("HEX_OPTIONS");
  if (!env)
    return true;

  char *buf = new char[strlen(env) + 1];

  // Pass 1: count the words (strtok modifies buf, hence the re-copy below).
  strcpy(buf, env);
  int argc = 1;
  for (char *word = strtok(buf, delimiters); word;
       word = strtok(NULL, delimiters))
    argc++;

  // Pass 2: collect the words into argv[1..]; the list is NULL-terminated.
  strcpy(buf, env);
  char **argv = new char *[argc + 1];
  argc = 1;
  char *tok = strtok(buf, delimiters);
  for (;;)
  {
    argv[argc] = tok;
    if (!tok)
      break;
    argc++;
    tok = strtok(NULL, delimiters);
  }

  const bool r = option(argc, argv);

  delete [] argv;
  // buf is intentionally not freed: option() may have stored pointers into
  // it (the -cs1 / -cs2 colour strings).
  return r;
}


// main: apply options from HEX_OPTIONS and then from the command line,
// open the input (stdin when no file name is left or it is "-"), and
// either dump it or, with --restore, convert a dump back to binary.
// Exit status is 1 when option handling asks to stop or the input cannot
// be opened, 0 otherwise.
int main(int argc, char *argv[])
{
  // Colour is the default rendering only when writing to a terminal.
  is_tty = isatty(fileno(stdout));
  if (is_tty)
    nonletter_attribute = color;

  // Environment first, so that command-line options can override it.
  if (!env_hex_options() || !option(argc, argv))
    return 1;

  FILE *ifp;
  if (argc != 1 && !is_argv1("-"))
    ifp = fopen(argv[1], "rb");
  else
  {
#if defined(MSVC)
    _setmode(fileno(stdin), _O_BINARY);
#elif defined(GNUWIN32)
    setmode(fileno(stdin), O_BINARY);
#endif
    ifp = stdin;
  }
  if (ifp == NULL)
  {
    fprintf(stderr, "%s: %s: No such file\n", argv[0], argv[1]);
    return 1;
  }

  FILE *ofp = stdout;
  if (ofp)
  {
    if (!does_restore)
      output_16(ifp, ofp);
    else
    {
      // Restored data is binary: avoid newline translation on Windows.
#if defined(MSVC)
      _setmode(fileno(stdout), _O_BINARY);
#elif defined(GNUWIN32)
      setmode(fileno(stdout), O_BINARY);
#endif
      restore(ifp, ofp);
    }
    fclose(ofp);
  }
  fclose(ifp);
  return 0;
}

Generated by  Doxygen 1.6.0   Back to index