[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

似非多言語化 emacs-w3m



>> On Thu, 07 Jun 2001 16:26:02 +0900 (JST)
>> 「白井」== shirai@rdmg.mgcs.mei.co.jp (Hideyuki SHIRAI (白井秀行)) said as follows:

白井> こちらは、Encoding が UTF-8 でも使っている文字が日本製なら OK。
白井> euc-japan に mapping 出来れば表示出来る、ということです。

白井> # 例えば、上野さんの日記は UTF-8 だけど、"@" 以外は全部表示でき
白井> # るでしょ。

白井> euc-japan に mapping 出来ない文字に関しては、そういうことです。
白井> http://www.is.titech.ac.jp/~yagi/emacs/pringles.html は典型例で
白井> すよね。

という件に、対処できるように工夫してみました。ただし、Mule-UCS が必要
です。

この実装では、euc-japan で表現できない文字を探し出し、それらの文字を一
つずつ数値文字参照に置換してから、w3m -halfdump を呼び出し、その結果に
含まれている数値文字参照をもう一度置換する、という手続きを踏んでいます。

この中で euc-japan で表現できない文字を探す、という部分の実装がどうに
もうまくないような気がするので、commit を躊躇っているのですが、もっと
簡単な方法はないでしょうか?
Index: w3m.el
===================================================================
RCS file: /home/tsuchiya/cvsroot/emacs-w3m/w3m.el,v
retrieving revision 1.359
diff -u -u -r1.359 w3m.el
--- w3m.el	10 Aug 2001 08:43:41 -0000	1.359
+++ w3m.el	13 Aug 2001 02:29:55 -0000
@@ -81,6 +81,9 @@
 (require 'thingatpt)
 (require 'timezone)
 
+(eval-when-compile
+  (ignore-errors (require 'unicode)))
+
 ;; Add-on programs:
 (eval-and-compile
   (autoload 'w3m-bookmark-view "w3m-bookmark" nil t)
@@ -138,6 +141,14 @@
 		 (const :tag "w3m-m17n" 'w3m-m17n)
 		 (symbol :tag "other" nil)))
 
+(defcustom w3m-use-mule-ucs
+  (and (locate-library "unicode.el")
+       (eq w3m-type 'w3m))		; default: unicode.el is found and w3m-type is `w3m'
+  "*Non-nil means use Mule-UCS to handle characters outside euc-japan."
+  :group 'w3m
+  :type 'boolean
+  :require 'unicode)
+
 (defcustom w3m-command
   (cond ((eq w3m-type 'w3mmee) "w3mmee")
 	((eq w3m-type 'w3m-m17n) "w3m-m17n")
@@ -660,7 +673,7 @@
 		 latin1-entity))))))
 (defconst w3m-entity-regexp
   (eval-when-compile
-    (format "&\\(%s\\|#[0-9]+\\);?"
+    (format "&\\(%s\\|#[0-9]+\\|#x[0-9a-f]+\\);?"
 	    (if (fboundp 'regexp-opt)
 		(let ((fn (function regexp-opt)))
 		  ;; Don't funcall directly for avoiding compile warning.
@@ -1367,13 +1380,16 @@
   (or (symbol-value (intern-soft name w3m-entity-db))
       (if (not (char-equal (string-to-char name) ?#))
 	  (concat "&" name)		; unknown entity
-	;; case of immediate character (accept only 0x20 .. 0x7e)
-	(let ((char (string-to-int (substring name 1))))
-	  ;; make character's representation with learning
-	  (set (intern name w3m-entity-db)
-	       (if (or (< char 32) (< 127 char))
-		   "~"			; un-supported character
-		 (char-to-string char)))))))
+	(setq name (substring name 1))
+	(let ((char (if (char-equal (string-to-char name) ?x)
+			(string-to-number (substring name 1) 16)
+		      (string-to-number name))))
+	  (if w3m-use-mule-ucs
+	      (char-to-string (ucs-to-char char))
+	    ;; case of immediate character (accept only 0x20 .. 0x7e)
+	    (if (or (< char 32) (< 127 char))
+		"~"			; un-supported character
+	      (char-to-string char)))))))
 
 (defun w3m-fontify-bold ()
   "Fontify bold characters in this buffer which contains half-dumped data."
@@ -2504,7 +2520,26 @@
        (write-repeat r0)))))
 
 (define-ccl-program w3m-euc-japan-encoder
-  `(1 (loop (read r0) (write-repeat r0))))
+  `(4
+    (loop
+     (read-multibyte-character r0 r1)	; r0 is compared with charset ids below
+     (if (r0 != 0)			; presumably charset id 0 is ASCII -- confirm
+	 (if (r0 != ,(w3m-static-if (boundp 'MULE)
+			 lc-jp
+		       (charset-id 'japanese-jisx0208)))	; not JIS X 0208
+	     (if (r0 != ,(w3m-static-if (boundp 'MULE)
+			     lc-kana
+			   (charset-id 'katakana-jisx0201)))	; nor JIS X 0201 katakana
+		 (if (r0 != ,(w3m-static-if (boundp 'MULE)
+				 lc-jp2
+			       (charset-id 'japanese-jisx0212)))	; nor JIS X 0212
+		     ((write ?&)	; wrap the character as "&#" CHAR ";"
+		      (write ?#)
+		      (write-multibyte-character r0 r1)
+		      (write ?\;)
+		      (repeat))))))	; and go on with the next character
+     (write-multibyte-character r0 r1)	; every other character is copied as is
+     (repeat))))
 
 (unless (w3m-static-if (featurep 'xemacs)
 	    (find-coding-system 'w3m-euc-japan)
@@ -2552,6 +2587,16 @@
 	       ((member "next" rel) (setq w3m-next-url href))
 	       ((member "prev" rel) (setq w3m-previous-url href))))))))))
 
+(defun w3m-replace-non-euc-japan-characters ()
+  "Replace every character that is not in euc-japan with a numeric entity."
+  (encode-coding-region (point-min) (point-max) 'w3m-euc-japan) ; presumably marks them as "&#C;" -- confirm
+  (decode-coding-region (point-min) (point-max) 'emacs-mule)
+  (goto-char (point-min))
+  (while (search-forward "&#" nil t)
+    (when (looking-at "[^0-9];")	; "&#" followed by one non-digit character and ";"
+      (insert (format "%d" (char-to-ucs (char-after)))) ; decimal UCS code goes before the character
+      (delete-char 1))))		; then the raw character itself is removed
+
 (defun w3m-rendering-region (start end &optional charset)
   "Do rendering of contents in this buffer as HTML and return title."
   (save-restriction
@@ -2561,6 +2606,8 @@
     (w3m-check-link-tags)
     (when w3m-use-form
       (w3m-form-parse-region (point-min) (point-max) charset))
+    (when w3m-use-mule-ucs
+      (w3m-replace-non-euc-japan-characters))
     (w3m-message "Rendering...")
     (let ((coding-system-for-read w3m-output-coding-system)
 	  (coding-system-for-write w3m-input-coding-system)
-- 
土屋 雅稔  ( TSUCHIYA Masatoshi )
    http://www-nagao.kuee.kyoto-u.ac.jp/member/tsuchiya/