[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

uri encode (was: quick search doesn't work for waei:, etc.)



>> On Wed, 09 Jul 2003 23:06:11 +0900
>> 「土」== tsuchiya@pine.kuee.kyoto-u.ac.jp (TSUCHIYA Masatoshi) said as follows:

土> しかし,本来,url は文字列として扱われるべきものですから,内部的に
土> は常に文字列として扱い,ネットワークに送信する(w3m コマンドを呼び
土> 出す)直前に encode するべきだと思います.しかし,それを実現するた
土> めには,その url を encode するのに適切な coding-system を常に文字
土> 列とペアで保持しておくように改善する必要があります.このような大き
土> な変更を emacs-w3m-1_3 枝に加えるのは,とても面倒なので,とりあえ
土> ず先送りして,adhoc な対処で済ませています.

山> text-props は使えるかしら。

土> 少し検討してみます.

検討してみました.意外と簡単な変更で出来たのですが,今度は別の問題で困っ
ています.

つまり,arrived DB には文字列の URI を記録するべきか,それとも,encode 
済みの octet 列の URI を記録するべきか,という問題です.添付のパッチを
適用して,

    http://example/日本語ファイル名

にアクセスすると,実際には

    (w3m-uri-transfer-encode-string "http://example/日本語ファイル名")
    => "http://example/%93%fa%96%7b%8c%ea%83%74%83%40%83%43%83%8b%96%bc"

にアクセスしにいくので,encode 済みの方が real URL という扱いになり,
arrived DB には元の文字列の URL と encode 済みの URL の両方の記録が残ります.

ユーザーの入力を考えると,emacs-w3m 内部では URI は文字列として扱うべ
きだとは思いますから,必然的に arrived DB も文字列の場合だけを記録に残
すようにするべきだ,ということになるのですけれど,そうすると w3m の実
行結果をなんとか解釈し直す必要があって,悩んでいます.

何か,簡単で良い方法はないでしょうか?

-- 
土屋 雅稔 ( TSUCHIYA Masatoshi )

Index: w3m-form.el
===================================================================
RCS file: /storage/cvsroot/emacs-w3m/w3m-form.el,v
retrieving revision 1.106
diff -u -r1.106 w3m-form.el
--- w3m-form.el	18 Jun 2003 02:07:17 -0000	1.106
+++ w3m-form.el	23 Jul 2003 15:22:16 -0000
@@ -448,11 +448,11 @@
 					(enctype :case-ignore)
 					(charset :case-ignore))
 	    (when action
-	      (setq action (w3m-url-transfer-encode-string
-			    (w3m-decode-anchor-string action)
-			    (if charset
-				(w3m-charset-to-coding-system charset)
-			      w3m-current-coding-system))))
+	      (setq action (w3m-decode-anchor-string action))
+	      (setf (w3m-uri-transfer-coding-system action)
+		    (if charset
+			(w3m-charset-to-coding-system charset)
+		      w3m-current-coding-system)))
 	    (setq forms
 		  (cons
 		   (cons
Index: w3m-util.el
===================================================================
RCS file: /storage/cvsroot/emacs-w3m/w3m-util.el,v
retrieving revision 1.43
diff -u -r1.43 w3m-util.el
--- w3m-util.el	15 Jul 2003 04:26:50 -0000	1.43
+++ w3m-util.el	23 Jul 2003 15:22:16 -0000
@@ -345,15 +345,7 @@
     (sort (w3m-list-buffers t)
 	  (function w3m-buffer-name-lessp))))
 
-
 ;;; Miscellaneous:
-(defconst w3m-url-fallback-base "http:///")
-(defconst w3m-url-invalid-regexp "\\`http:///")
-
-(defsubst w3m-url-valid (url)
-  (and url (not (string-match w3m-url-invalid-regexp url))
-       url))
-
 (defmacro w3m-tag-regexp-of (&rest names)
   "Return a regexp string, not a funtion form.  A regexp should match tags
 which are started with \"<\" and one of NAMES.  NAMES should be string
@@ -378,6 +370,40 @@
   (+ (* (- (car end) (car start)) 65536)
      (cadr end)
      (- (cadr start))))
+
+;;; URI handling functions:
+(defsubst w3m-uri-set-properties (uri properties)
+  "Add properties to a given URI, and return it."
+  (set-text-properties 0 (length uri) nil uri)
+  (put-text-property 0 (length uri) 'w3m-uri-properties properties uri))
+
+(defsetf w3m-uri-properties w3m-uri-set-properties)
+(defsubst w3m-uri-properties (uri)
+  "Return the properties associated to a given URI."
+  (get-text-property 0 'w3m-uri-properties uri))
+
+(defsubst w3m-uri-put (uri propname value)
+  "Store URI's PROPNAME property with value VALUE."
+  (setf (w3m-uri-properties uri)
+	(plist-put (w3m-uri-properties uri) propname value))
+  value)
+
+(defsetf w3m-uri-get w3m-uri-put)
+(defsubst w3m-uri-get (uri propname)
+  "Return the value of URI's PROPNAME property."
+  (plist-get (w3m-uri-properties uri) propname))
+
+(defmacro w3m-uri-transfer-coding-system (uri)
+  "Return the coding system to transfer a given URI."
+  `(w3m-uri-get ,uri 'transfer-coding-system))
+
+;; MEMO: 以下の w3m-url-* 関数群は,徐々に w3m-uri-* に名前を統一する予定.
+(defconst w3m-url-fallback-base "http:///")
+(defconst w3m-url-invalid-regexp "\\`http:///")
+
+(defsubst w3m-url-valid (url)
+  (and url (not (string-match w3m-url-invalid-regexp url))
+       url))
 
 (defsubst w3m-url-local-p (url)
   "If URL points a file on the local system, return non-nil value.
Index: w3m.el
===================================================================
RCS file: /storage/cvsroot/emacs-w3m/w3m.el,v
retrieving revision 1.831
diff -u -r1.831 w3m.el
--- w3m.el	17 Jul 2003 14:52:36 -0000	1.831
+++ w3m.el	23 Jul 2003 15:22:16 -0000
@@ -2328,29 +2328,34 @@
 	 w3m-coding-system
 	 'iso-2022-7bit))))
 
-(defsubst w3m-url-transfer-encode-string (url &optional coding)
-  "Encode all non-ASCII characters included in URL to sequences of
+;; MEMO: query 部分は,上の w3m-url-encode-string() で encode し,残る
+;; 部分のみを送信用に encode するべきなのか?
+(defsubst w3m-uri-transfer-encode-string (uri &optional coding)
+  "Encode all non-ASCII characters included in URI to sequences of
 escaped octets in the specified coding system.
-This function is designed for conversion for safe transmission of URL.
+This function is designed for conversion for safe transmission of URI.
 Therefore, this function handles only non-ASCII characters that can
 not be transmitted safely with network streams.  In general, you
 should use `w3m-url-encode-string' instead of this."
   (let ((start 0)
 	(buf))
-    (while (string-match "[^\x21-\x7e]+" url start)
+    (unless coding
+      (setq coding (or (w3m-uri-transfer-coding-system uri)
+		       w3m-default-coding-system
+		       w3m-coding-system
+		       'iso-2022-7bit)))
+    (while (string-match "[^\x21-\x7e]+" uri start)
       (setq buf
 	    (cons (apply 'concat
 			 (mapcar
 			  (lambda (c) (format "%%%02x" c))
-			  (append (encode-coding-string
-				   (match-string 0 url)
-				   (or coding
-				       w3m-current-coding-system)))))
-		  (cons (substring url start (match-beginning 0))
+			  (append (encode-coding-string (match-string 0 uri)
+							coding))))
+		  (cons (substring uri start (match-beginning 0))
 			buf))
 	    start (match-end 0)))
     (apply 'concat
-	   (nreverse (cons (substring url start) buf)))))
+	   (nreverse (cons (substring uri start) buf)))))
 
 
 ;;; HTML character entity handling:
@@ -2456,8 +2461,7 @@
     (while (re-search-forward "<a[ \t\r\f\n]+" nil t)
       (setq start (match-beginning 0))
       (setq prenames (get-text-property start 'w3m-name-anchor))
-      (w3m-parse-attributes (href name charset
-				  (rel :case-ignore) (hseq :integer))
+      (w3m-parse-attributes (href name (rel :case-ignore) (hseq :integer))
 	(when rel
 	  (setq rel (split-string rel))
 	  (cond
@@ -2472,18 +2476,11 @@
 	  (when (re-search-forward "[ \t\r\f\n]*\\(</a>\\)" nil t)
 	    (setq end (match-beginning 0))
 	    (delete-region (match-beginning 1) (match-end 1))
-	    (setq href (w3m-expand-url (w3m-decode-anchor-string href)))
-	    (setq href (if (and (string-match w3m-url-components-regexp href)
-				(match-beginning 8))
-			   (concat (w3m-url-transfer-encode-string
-				    (substring href 0 (match-beginning 8))
-				    (w3m-charset-to-coding-system charset))
-				   "#" (match-string 9 href))
-			 (w3m-url-transfer-encode-string
-			  href
-			  (w3m-charset-to-coding-system charset)))
+	    (setq href (w3m-expand-url (w3m-decode-anchor-string href))
 		  hseq (or (and (null hseq) 0) (abs hseq))
 		  w3m-max-anchor-sequence (max hseq w3m-max-anchor-sequence))
+	    (setf (w3m-uri-transfer-coding-system href)
+		  w3m-current-coding-system)
 	    (w3m-add-text-properties start end
 				     (list 'face (if (w3m-arrived-p href)
 						     'w3m-arrived-anchor-face
@@ -3454,7 +3451,8 @@
 	(w3m-process-do-with-temp-buffer
 	    (success (progn
 		       (setq w3m-current-url url
-			     url (w3m-url-strip-authinfo url))
+			     url (w3m-uri-transfer-encode-string
+				  (w3m-url-strip-authinfo url)))
 		       (w3m-process-start handler
 					  w3m-command
 					  (append w3m-command-arguments
@@ -3568,7 +3566,7 @@
 			     w3m-command-arguments
 			     (w3m-w3m-expand-arguments
 			      w3m-dump-head-source-command-arguments)
-			     (list url))))
+			     (list (w3m-uri-transfer-encode-string url)))))
       (w3m-message "Reading %s...done" url)
       (when result
 	(goto-char (point-min))
@@ -6006,11 +6004,7 @@
     nil ;; referer
     nil ;; handler
     t)) ;; qsearch
-  (set-text-properties 0 (length url) nil url)
   (setq url (w3m-uri-replace url))
-  (unless (or (w3m-url-local-p url)
-	      (string-match "\\`about:" url))
-    (setq url (w3m-url-transfer-encode-string url w3m-default-coding-system)))
   (cond
    ;; process mailto: protocol
    ((string-match "\\`mailto:" url)