2006-02-15

Hexifier and XML entities escaper

Happy belated Valentine's day.



I have a present for the Emacsers out there: a hexifier and an XML
entities escaper.



ys-hex306.el


(require 'hexl)

(defun ys/escape (string to-escape-char &optional escape-char)
  "Return STRING with TO-ESCAPE-CHAR and ESCAPE-CHAR prefixed by ESCAPE-CHAR.

ESCAPE-CHAR defaults to backslash.  Existing occurrences of
ESCAPE-CHAR are prefixed as well, not only TO-ESCAPE-CHAR.

Example, escaping slashes:
  input:   / \\/ \\\\/ \\n
  output:  \\/ \\\\\\/ \\\\\\\\\\/ \\\\n"
  (let* ((prefix (char-to-string (or escape-char ?\\)))
         ;; Matches either of the two characters that need a prefix.
         (needs-escape (concat (regexp-quote (char-to-string to-escape-char))
                               "\\|"
                               (regexp-quote prefix))))
    (with-temp-buffer
      (insert string)
      (goto-char (point-min))
      (save-match-data
        ;; Point ends up after each replacement, so the inserted prefix
        ;; is never matched again.
        (while (re-search-forward needs-escape nil t)
          (replace-match (concat prefix (match-string 0)) t t)))
      (buffer-string))))











(defun ys/unescape (string &optional escape-char)
  "Return STRING with every ESCAPE-CHAR + X pair replaced by X.

X may be any single character, including a newline and ESCAPE-CHAR
itself.  ESCAPE-CHAR defaults to backslash.  An ESCAPE-CHAR that is
the very last character of STRING has nothing to escape and is kept.

This undoes `ys/escape'.  Example:
  input:   \\/ \\n \\\\\\/
  output:  / n \\/"
  (unless escape-char
    (setq escape-char ?\\))
  (with-temp-buffer
    (insert string)
    (goto-char (point-min))
    (save-match-data
      ;; `.' does not match a newline in Emacs regexps, so the explicit
      ;; newline alternative is needed for an escaped newline to be decoded.
      (let ((regexp (concat (regexp-quote (char-to-string escape-char))
                            "\\(.\\|\n\\)")))
        ;; Point ends up after each replacement, so an unescaped
        ;; ESCAPE-CHAR is never treated as the start of a new pair.
        (while (re-search-forward regexp nil t)
          (replace-match (match-string 1) t t))))
    (buffer-string)))
  





(defun ys/hexstring-to-charstring (hexstring)
  "Decode HEXSTRING, a run of two-digit hex codes, into characters.

Each pair of hex digits (either case) becomes the character with that
code, so \"303132\" yields \"012\".  Signals an error when the decoded
text is not exactly half as long as HEXSTRING, which happens for
odd-length input or input containing a non-hex character."
  (let* ((wanted (/ (length hexstring) 2))
         (decoded
          (with-temp-buffer
            (insert hexstring)
            (goto-char (point-min))
            (save-match-data
              ;; Each replacement leaves point just after the new
              ;; character, so the following pair is examined next.
              (while (looking-at "\\([0-9A-Fa-f]\\)\\([0-9A-Fa-f]\\)")
                (let ((high (string-to-char (match-string 1)))
                      (low (string-to-char (match-string 2))))
                  (replace-match (char-to-string (hexl-htoi high low)) t t))))
            (buffer-string)))
         (actual (length decoded)))
    ;; Anything left unconverted makes the result too long.
    (unless (= actual wanted)
      (error "Expected a charstring of length %d, but instead the length is %d" wanted actual))
    decoded))



(defun ys/charstring-to-hexstring (charstring)
  "Return the character codes of CHARSTRING as concatenated lowercase hex.

Every character is formatted with \"%02x\", so codes below 16 are
zero-padded to two digits; for example \"012/\" becomes \"3031322f\"."
  (let ((hex-pairs (mapcar (lambda (code) (format "%02x" code))
                           charstring)))
    (apply #'concat hex-pairs)))





(defun ys/hex306string-to-char306string (hex306string)
  "Decode HEX306STRING into its escaped character form.

Runs of hex digits are decoded with `ys/hexstring-to-charstring' and
then passed through `ys/escape', so decoded slashes and backslashes
come out backslash-escaped.  Text delimited by a pair of slashes on
one line is copied through unchanged.  Signals an error when any
other character is encountered.

Example:
  input:   30312f/keep me/32
  output:  01\\//keep me/2"
  (with-temp-buffer
    (insert hex306string)
    (goto-char (point-min))
    (save-match-data
      ;; The `*' lets the regexp match an empty run, so the search alone
      ;; never fails; the end-of-buffer test is what stops the loop.
      (while (and (not (eobp))
                  (re-search-forward "\\([0-9A-Fa-f]*\\)" nil t))
        (let ((decoded (ys/hexstring-to-charstring (match-string 0))))
          (replace-match (ys/escape decoded ?/) t t))
        ;; After a hex run only a /.../ region or the end of input is valid.
        (cond ((looking-at "/.*?/")
               (goto-char (match-end 0)))
              ((not (eobp))
               (error "Invalid hex306 string. Encountered a non-hexchar char but also not surrounded with //")))))
    (buffer-string)))





(defun ys/char306--hexify-span (begin end)
  "Replace the current buffer's text from BEGIN to END with its hex form.
The text is unescaped with `ys/unescape' and then encoded with
`ys/charstring-to-hexstring'.  Leaves point just after the inserted hex."
  (let ((pending (buffer-substring begin end)))
    (delete-region begin end)
    (goto-char begin)
    ;; Plain `insert' (rather than `replace-match') keeps the hex digits
    ;; exactly as produced: no case conversion based on the replaced text.
    (insert (ys/charstring-to-hexstring (ys/unescape pending)))))

(defun ys/char306string-to-hex306string (char306string)
  "Encode CHAR306STRING, hex-encoding everything outside /.../ regions.

Text between a pair of unescaped slashes on one line is copied through
unchanged, delimiters included.  All other text is first unescaped
with `ys/unescape' (so a backslash-escaped slash stands for a literal
slash) and then hex-encoded with `ys/charstring-to-hexstring'.  An
empty CHAR306STRING yields an empty string.

Example:
  input:   01\\//keep me/2
  output:  30312f/keep me/32"
  (with-temp-buffer
    (insert char306string)
    (goto-char (point-min))
    (save-match-data
      ;; CHARSTRING-BEGIN marks the start of the text that still has to
      ;; be hex-encoded.
      (let ((charstring-begin (point-min)))
        (while (not (eobp))
          (cond
           ;; Consume an escaped character first, so an escaped slash is
           ;; never mistaken for a region delimiter.
           ((looking-at "\\\\.")
            (goto-char (match-end 0)))
           ;; A /.../ region: encode what precedes it, then step over the
           ;; region itself without touching it.
           ((looking-at "/\\(.*?\\)/")
            (let ((region-length (- (match-end 0) (match-beginning 0))))
              (ys/char306--hexify-span charstring-begin (point))
              (forward-char region-length)
              (setq charstring-begin (point))))
           ;; Any other character (including newline) is ordinary text.
           (t
            (forward-char 1))))
        ;; Encode whatever follows the last region.
        (ys/char306--hexify-span charstring-begin (point-max))))
    (buffer-string)))








;; ys/charstring-to-hexstring-region and ys/hexstring-to-charstring-region convert between these two forms:
;; 0123\/convert this\/ but /don't convert this/ finally \/convert this\/ 
;; 303132335c2f636f6e7665727420746869735c2f20627574202f646f6e277420636f6e7665727420746869732f2066696e616c6c79205c2f636f6e7665727420746869735c2f


(defun ys/hexstring-to-charstring-region (region-begin region-end)
  "Replace the hex digits in the region with the characters they encode.

The text between REGION-BEGIN and REGION-END is decoded by
`ys/hexstring-to-charstring' and the result takes its place; for
example, a region holding 303132 ends up holding 012.  Interactively,
REGION-BEGIN and REGION-END are the bounds of the region."
  (interactive "r")
  (save-excursion
    (let* ((hex-text (buffer-substring region-begin region-end))
           (decoded (ys/hexstring-to-charstring hex-text)))
      (delete-region region-begin region-end)
      (goto-char region-begin)
      (insert decoded))))


(defun ys/charstring-to-hexstring-region (region-begin region-end)
  "Replace the text in the region with the hex codes of its characters.

The text between REGION-BEGIN and REGION-END is encoded by
`ys/charstring-to-hexstring' and the result takes its place; for
example, a region holding 012 ends up holding 303132.  Interactively,
REGION-BEGIN and REGION-END are the bounds of the region."
  (interactive "r")
  (save-excursion
    (let ((encoded (ys/charstring-to-hexstring
                    (buffer-substring region-begin region-end))))
      (delete-region region-begin region-end)
      (goto-char region-begin)
      (insert encoded))))




;; ys/char306string-to-hex306string-region and ys/hex306string-to-char306string-region convert between these two forms:
;; 0123\/convert this\/ but /don't convert this/ finally \/convert this\/
;; 303132332f636f6e7665727420746869732f2062757420/don't convert this/2066696e616c6c79202f636f6e7665727420746869732f

(defun ys/char306string-to-hex306string-region (region-begin region-end)
  "Replace the region with its hex306 encoding.

The text between REGION-BEGIN and REGION-END is converted by
`ys/char306string-to-hex306string' and the result takes its place.
Interactively, REGION-BEGIN and REGION-END are the bounds of the region.

Example:
  before:  01\\//keep me/2
  after:   30312f/keep me/32"
  (interactive "r")
  (save-excursion
    (let ((encoded (ys/char306string-to-hex306string
                    (buffer-substring region-begin region-end))))
      (delete-region region-begin region-end)
      (goto-char region-begin)
      (insert encoded))))

(defun ys/hex306string-to-char306string-region (region-begin region-end)
  "Replace the hex306 text in the region with its decoded form.

The text between REGION-BEGIN and REGION-END is converted by
`ys/hex306string-to-char306string' and the result takes its place.
Interactively, REGION-BEGIN and REGION-END are the bounds of the region.

Example:
  before:  30312f/keep me/32
  after:   01\\//keep me/2"
  (interactive "r")
  (save-excursion
    (let* ((hex306-text (buffer-substring region-begin region-end))
           (decoded (ys/hex306string-to-char306string hex306-text)))
      (delete-region region-begin region-end)
      (goto-char region-begin)
      (insert decoded))))



;; Announce the `ys-hex306' feature so that (require 'ys-hex306) succeeds.
(provide 'ys-hex306)


ys-xml-escape.el

(require 'cl)
(defun ys/xml-escape-entities-and-non-printable-ascii (string)
  "Return STRING with XML-special and non-printable characters escaped.

The characters <, >, &, ' and \" become the named entities &lt;, &gt;,
&amp;, &apos; and &quot;.  Any other character whose code lies outside
the inclusive range 32..126 becomes a decimal reference produced with
the format \"&#%02d;\".  Everything else is copied unchanged."
  (let ((named-entities '((?<  . "&lt;")
                          (?>  . "&gt;")
                          (?&  . "&amp;")
                          (?\' . "&apos;")
                          (?\" . "&quot;"))))
    (mapconcat
     (lambda (char)
       (let ((named (assq char named-entities)))
         (cond (named (cdr named))
               ;; Outside printable ASCII: emit a decimal reference.
               ((or (< char 32) (> char 126)) (format "&#%02d;" char))
               (t (char-to-string char)))))
     string
     "")))
;(insert (concat "\n" (ys/xml-escape-entities-and-non-printable-ascii "<goo&ten, \"'night\", he said>�")))
;&lt;goo&amp;ten, &quot;&apos;night&quot;, he said&gt;&#00;

(defun ys/xml-unescape-entities (string)
  "Return STRING with XML character and entity references decoded.

Decimal references (&#65;), hexadecimal references (&#x41;) and the
five predefined entities &lt; &gt; &amp; &apos; &quot; are replaced by
the characters they stand for.  Text produced by a replacement is not
scanned again, so &amp;lt; decodes to the four characters &lt;.  An
ampersand that does not start a well-formed reference is left alone.

Signals an error for a well-formed but unknown entity name."
  (with-temp-buffer
    (insert string)
    (goto-char (point-min))
    (save-match-data
      (let ((case-fold-search nil)      ; entity names are case-sensitive
            (named-entities '(("lt"   . "<")
                              ("gt"   . ">")
                              ("amp"  . "&")
                              ("apos" . "'")
                              ("quot" . "\""))))
        (while (not (eobp))
          ;; Every `replace-match' below is literal and fixed-case: the
          ;; replacement may itself be a backslash, and must never be
          ;; re-cased.  Each call leaves point after the inserted text.
          (cond
           ;; &#NNN; is a decimal code point, matching what
           ;; `ys/xml-escape-entities-and-non-printable-ascii' emits.
           ((looking-at "&#\\([0-9]+\\);")
            (replace-match
             (char-to-string (string-to-number (match-string 1) 10)) t t))
           ;; &#xHHH; is a hexadecimal code point.
           ((looking-at "&#[xX]\\([0-9A-Fa-f]+\\);")
            (replace-match
             (char-to-string (string-to-number (match-string 1) 16)) t t))
           ;; &name; is looked up by string so only the exact names match.
           ((looking-at "&\\([A-Za-z][A-Za-z0-9]*\\);")
            (let* ((entity (match-string 1))
                   (expansion (cdr (assoc entity named-entities))))
              (unless expansion
                (error "Unknown XML entity: %s" entity))
              (replace-match expansion t t)))
           (t
            (forward-char 1))))))
    (buffer-string)))
;(insert (concat "\n" (ys/xml-unescape-entities "&lt;goo&amp;ten, &quot;&apos;night&quot;, he said&gt;&#00;")))
;<goo&ten, "'night", he said>�

;; Scratch check left over from development; its value is not used.
;; It evaluates to t because `intern' returns the same symbol object
;; that the reader produces for `lt'.
(eql (intern "lt") 'lt)


(defun ys/xml-escape-entities-and-non-printable-ascii-region (region-begin region-end)
  "Escape XML entities and non-printable characters in the region.

The text between REGION-BEGIN and REGION-END is replaced by the result
of `ys/xml-escape-entities-and-non-printable-ascii' applied to it.
Interactively, REGION-BEGIN and REGION-END are the bounds of the region."
  (interactive "r")
  (save-excursion
    (let* ((raw-text (buffer-substring region-begin region-end))
           (escaped (ys/xml-escape-entities-and-non-printable-ascii raw-text)))
      (delete-region region-begin region-end)
      (goto-char region-begin)
      (insert escaped))))

(defun ys/xml-unescape-entities-region (region-begin region-end)
  "Decode XML entities in the region.

The text between REGION-BEGIN and REGION-END is replaced by the result
of `ys/xml-unescape-entities' applied to it.  Interactively,
REGION-BEGIN and REGION-END are the bounds of the region."
  (interactive "r")
  (save-excursion
    (let* ((escaped-text (buffer-substring region-begin region-end))
           (unescaped (ys/xml-unescape-entities escaped-text)))
      (delete-region region-begin region-end)
      (goto-char region-begin)
      (insert unescaped))))

;; Announce the `ys-xml-escape' feature so that (require 'ys-xml-escape) succeeds.
(provide 'ys-xml-escape)

What I put into my .emacs for these two utils:


;; Load the hex306 converters and bind them under the C-c h prefix.
(require 'ys-hex306)
(global-set-key (kbd "C-c h c") 'ys/hex306string-to-char306string-region)
(global-set-key (kbd "C-c h h") 'ys/char306string-to-hex306string-region)
;; Load the XML escaper and bind it under the C-c x prefix.
(require 'ys-xml-escape)
(global-set-key (kbd "C-c x e") 'ys/xml-escape-entities-and-non-printable-ascii-region)
(global-set-key (kbd "C-c x u") 'ys/xml-unescape-entities-region)

(defun ys/quote-for-docstring (region-begin region-end)
  "Insert at point a string literal made from the commented region.

The text between REGION-BEGIN and REGION-END is copied, its comment
markers are removed using the current buffer's major mode, and the
result is printed as a Lisp string (double quotes and backslashes
escaped) and inserted at point.  The region itself is left untouched
and point is restored afterwards.  Interactively, REGION-BEGIN and
REGION-END are the bounds of the region."
  (interactive "r")
  (save-excursion
    (let* (;; Capture the mode function here: inside the temporary buffer
           ;; `major-mode' would be that buffer's own mode.  `major-mode'
           ;; is used instead of a name derived from `mode-name' because
           ;; the display name need not be a plain string, nor match the
           ;; name of the mode function.
           (original-major-mode major-mode)
           (commented (buffer-substring-no-properties region-begin region-end))
           (uncommented (with-temp-buffer
                          ;; Enable the same mode so `uncomment-region'
                          ;; knows the comment syntax.
                          (funcall original-major-mode)
                          (insert commented)
                          (uncomment-region (point-min) (point-max))
                          (buffer-substring-no-properties (point-min) (point-max))))
           ;; `print' emits a newline before and after the literal.
           (quoted (with-output-to-string (print uncommented))))
      (insert quoted))))


The function ys/quote-for-docstring is especially handy for
constructing a docstring with a lot of backslashes.


(originally from http://microjet.ath.cx/WebWiki/2006.02.15_Hexifier_and_XML_Entities_Escaper.html)