JFP 開発ガイド

複数バイト表現での文字列処理

表 2-1 で主な複数バイト表現の文字列操作のための国際化 API を紹介します。他にもAPI が用意されています。詳しくはマニュアルページ (Intro(3) など) を参照してください。

表 2-1 主な複数バイト文字列操作 API


`インタフェース名`	`作用`
`strcat(s1,s2)`	`s2` を `s1` に追加する。追加後の `s1` が返る
`strncat(s1,s2,n)`	`s2` のうち最大 `n` バイトを `s1` に追加する。追加後の `s1` が返る
`strcmp(s1,s2)`	`s1` と `s2` の大小関係を調べる。 (順序情報に基づかない)
`strncmp(s1,s2,n)`	`s1` と `s2` の最大 `n` バイトの大小関係を調べる。 (順序情報に基づかない)
`strcoll(s1,s2)`	順序情報に基づき `s1` と `s2` の大小関係を調べる
`strcpy(s1,s2)`	`s2` を `s1` にコピーする。コピー後の `s1` が返る
`strncpy(s1,s2,n)`	`s2` の最大 `n` バイトを `s1` にコピーする。コピー後の `s1` が返る
`strlen(s)`	`s` の長さをバイト数で返す
`strxfrm(s1,s2,n)`	大小関係を調べるための文字列の変形

注 -

このほか文字列に含まれる文字を検出する strchr() および strrchr() がありますが、複数バイト文字列に対してこれらの API は正しく動作しません。複数バイト文字列に含まれる文字を検出する場合は、文字列を mbstowcs() (後述) などでワイド文字列に変換してから wcschr() または wcsrchr() を使用してください。

プログラム例

複数バイト文字列操作の API を用いたプログラム例を紹介します。使用する場合には string.h ヘッダファイルを取り込むこと、また setlocale() を処理の最初の段階で呼び出して動作ロケールを適切に設定することが必要です。

この例では文字列を比較しています。一般に文字列の順序関係は文字列を構成する各文字の順序関係で決まり、文字の順序関係はロケールごとに LC_COLLATE カテゴリに定義されています。strcmp() は文字列をバイト単位で比較するため、ロケールで定義された順序関係どおりの結果にならない場合があります。順序情報に基づく比較を行うには strcoll() を使用する必要がありますが、一般には strcmp() に比べて低速です。strxfrm() は、本来 strcoll() で比較すべき文字列を変形させます。変形は、変形されたあとの文字列同士を strcmp() で比較した結果が、変形する前の文字列同士を strcoll() で比較した結果と同一になるように行なわれます。このため、データベースを管理する場合など、多くのデータを順序関係に基づいて並べ換える場合には、あらかじめ strxfrm() で変形しておいた文字列を strcmp() で比較することで効率化が期待できます。

例 2-1 において、main() 関数から my_strcoll() を呼び出していますが、システムが提供する strcoll() を呼び出すように変更しても全く同じ結果が得られます。

例 2-1 複数バイト文字列操作 API

sun% cat my_strcoll.c
/*
 * Read lines from two files and report, for each pair
 * of lines, the same order that strcoll() would give.
 * Comparing will stop if either file reaches EOF.
 * It is assumed that each line is at most BUFSIZ - 1
 * bytes long.
 *
 * Actual processing is done by my_strcoll(), which
 * does the following.
 *	1.	Call strxfrm() to get the size of
 *		the transformed string.
 *	2.	Dynamically allocate memory for the
 *		buffer. It will be big enough to contain
 *		the transformed string and the terminating NUL.
 *	3.	Call strxfrm() again to get
 *		the transformed string. To detect whether
 *		an error happened, it must clear `errno',
 *		then call strxfrm(). After that, check
 *		the value of `errno'.
 *	4.	Call strcmp() with the transformed strings.
 *		Since these strings are artificially created,
 *		they are not meant to be displayed.
 */

#include <stdio.h>
#include <locale.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>

static int my_strcoll(const char *, const char *);

/*
 * Compare two text files line by line and print, for each pair of
 * lines, which one is bigger according to my_strcoll().
 *
 * Usage: my_strcoll file1 file2
 *
 * Lines are read with fgets(), so each compared string still carries
 * its trailing newline (when the line fits in the buffer).
 * Processing stops as soon as either file reaches EOF; if only one of
 * them ended, a notice naming that file is written to stderr.
 *
 * Returns 0 after normal processing. Terminates with exit(-1) on a
 * usage error or when either file cannot be opened.
 */
int
main(int argc, char *argv[])
{
	FILE *fp1, *fp2;
	char buf1[BUFSIZ], buf2[BUFSIZ];
	char *cp1, *cp2;
	int retval;

	/* Adopt the locale from the environment before any comparison. */
	setlocale(LC_ALL, "");

	if (argc != 3) {
		fprintf(stderr, "\tUsage: %s file1 file2\n", argv[0]);
		exit(-1);
	}
	fp1 = fopen(argv[1], "r");
	fp2 = fopen(argv[2], "r");
	if (fp1 == NULL || fp2 == NULL) {
		fprintf(stderr, "%s: Couldn't open %s \n",
			argv[0],
			(fp1 == NULL) ? argv[1] : argv[2]);
		exit(-1);
	}
	for (;;) {
		cp1 = fgets(buf1, sizeof (buf1), fp1);
		cp2 = fgets(buf2, sizeof (buf2), fp2);
		if (cp1 == NULL && cp2 == NULL) {
			/* Both files ended on the same line: nothing to report. */
			break;
		}
		if (cp1 == NULL || cp2 == NULL) {
			/* Name the file that ran out of lines first. */
			fprintf(stderr, "%s: No more contents in %s\n",
				argv[0], (cp1 != NULL) ? argv[2] : argv[1]);
			break;
		}
		retval = my_strcoll(buf1, buf2);
		if (retval == 0) {
			fprintf(stdout, "The same collation order.\n");
		} else if (retval > 0) {
			fprintf(stdout,
				"%s is bigger than %s in terms of collation order.\n",
				argv[1], argv[2]);
		} else {
			fprintf(stdout,
				"%s is bigger than %s in terms of collation order.\n",
				argv[2], argv[1]);
		}
	}
	fclose(fp1);
	fclose(fp2);
	return (0);
}

/*
 * Compare two strings by transforming each with strxfrm() and
 * comparing the transformed strings with strcmp().
 *
 * cp1, cp2: NUL-terminated strings to compare.
 *
 * Returns the strcmp() result for the transformed strings: negative,
 * zero or positive when cp1 sorts before, equal to or after cp2.
 *
 * On allocation failure or when strxfrm() sets errno, a diagnostic is
 * printed with perror() and the process terminates with exit(-1).
 */
static int
my_strcoll(const char *cp1, const char *cp2)
{
	char *transform_1, *transform_2;
	size_t xfrm_len1, xfrm_len2;
	int ret_coll;

	/* With a size of 0, strxfrm() only reports the length needed. */
	xfrm_len1 = strxfrm(NULL, cp1, 0);
	xfrm_len2 = strxfrm(NULL, cp2, 0);

	/* One extra byte for the terminating NUL. */
	transform_1 = malloc(xfrm_len1 + 1);
	transform_2 = malloc(xfrm_len2 + 1);
	if (transform_1 == NULL || transform_2 == NULL) {
		perror("my_strcoll(): Couldn't allocate memory");
		free(transform_1);
		free(transform_2);
		exit(-1);
	}

	/*
	 * strxfrm() has no error return value, so errno must be cleared
	 * before each call and inspected afterwards.
	 */
	errno = 0;
	strxfrm(transform_1, cp1, xfrm_len1 + 1);
	if (errno != 0) {
		perror("my_strcoll(): Error in transforming 1st string");
		exit(-1);
	}
	errno = 0;
	strxfrm(transform_2, cp2, xfrm_len2 + 1);
	if (errno != 0) {
		perror("my_strcoll(): Error in transforming 2nd string");
		exit(-1);
	}
	ret_coll = strcmp(transform_1, transform_2);

	free(transform_1);
	free(transform_2);

	return (ret_coll);
}
sun% cat file1
入力サンプル 1 です。
This line is identical.
短いです。
sun% cat file2
入力サンプル 2 です。
This line is identical.
こちらの行は長くなっています。
sun% cc -o my_strcoll my_strcoll.c
sun% ./my_strcoll file1 file2
file2 is bigger than file1 in terms of collation order.
The same collation order.
file1 is bigger than file2 in terms of collation order.
./my_strcoll: No more contents in file1