This repository has been archived on 2024-04-08. You can view files and clone it, but cannot push or open issues or pull requests.
deb-goldedplus/golded3/gehtml.cpp
Alexander S. Aganichev d52f9d1109 Fix CR/LF problem
2003-08-09 10:15:02 +00:00

140 lines
4.4 KiB
C++
Executable File

// ------------------------------------------------------------------
// GoldED+
// Copyright (C) 2003 Alexander S. Aganichev
// ------------------------------------------------------------------
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston,
// MA 02111-1307 USA
// ------------------------------------------------------------------
// $Id$
// ------------------------------------------------------------------
// HTML tag remover.
// ------------------------------------------------------------------
#include <golded.h>
// ------------------------------------------------------------------
// One named HTML character reference (the text between '&' and ';')
// together with the single plain-text character that stands in for it.
struct html_entities {
  const char *tag;
  char replacement;
};

// Entity table consulted by RemoveHTML(). It is scanned front to back
// and the first matching row wins, so the row order is significant.
static const html_entities entities[] = {
  { "nbsp",   ' ' },
  { "brvbar", '|' },
  { "laquo",  '<' },
  { "shy",    '-' },
  { "raquo",  '>' },
  { "divide", '/' },
  { "quot",   '"' },
  { "amp",    '&' },
  { "lt",     '<' },
  { "gt",     '>' }
};
// ------------------------------------------------------------------
// Strips HTML markup from the NUL-terminated text in txt.
//
// txt is passed by reference: when anything was removed the buffer is
// resized with throw_realloc() and the pointer may change, so txt must
// be a block that throw_realloc() accepts.
//
// Behaviour, in short:
//  - Text is copied verbatim until the parser enters "html mode", which
//    happens at "<html", "<!DOCTYPE", "<!--" or any closing tag "</...".
//    "</html>" leaves html mode again.
//  - A '<' seen outside html mode that does not start one of the above
//    is kept as an ordinary character.
//  - In html mode every tag is dropped.  <b>/<i>/<u> (opening or
//    closing) become '*', '/' and '_'; </hN>, </p>, </tr>, </div> and
//    <br> become CR.
//  - In html mode runs of blanks/control characters collapse to a single
//    space, and the entities listed in entities[] are replaced by their
//    one-character substitute.
//  - A '>' inside a double-quoted attribute value does not end the tag.
//
// Every input character yields at most one output character, so the
// scratch buffer of strlen(txt) + 1 bytes can never overflow.
void RemoveHTML (char *&txt) {

  long i, j, len = strlen(txt) + 1;
  char *new_txt = (char *)throw_malloc(len);

  bool strip = false;                // between '<' and '>' of a dropped tag
  bool quoted = false;               // inside "..." within that tag
  bool inside_html = false;          // html mode is active
  bool last_char_was_space = true;   // suppresses leading/repeated blanks

  for(i = j = 0; txt[i] != NUL; i++) {

    if(not quoted and not strip and (txt[i] == '<')) {
      if(strnieql(txt + i, "<html", 5) or strnieql(txt + i, "<!DOCTYPE", 9)
        or strnieql(txt + i, "<!--", 4)) {
        inside_html = true;
        strip = true;
      }
      else if(strnieql(txt + i, "</html>", 7)) {
        inside_html = false;
        strip = true;
      }
      else if(not inside_html and (txt[i + 1] == '/')) {
        inside_html = true;  // closing html tag, force html mode
        strip = true;
      }
      else if(inside_html) {
        strip = true;
        if(strnieql(txt + i, "<b>", 3) or strnieql(txt + i, "</b>", 4))
          new_txt[j++] = '*';
        if(strnieql(txt + i, "<i>", 3) or strnieql(txt + i, "</i>", 4))
          new_txt[j++] = '/';
        if(strnieql(txt + i, "<u>", 3) or strnieql(txt + i, "</u>", 4))
          new_txt[j++] = '_';
        // <ctype.h> classifiers require a value representable as
        // unsigned char; a plain (possibly negative) char is undefined.
        if((strnieql(txt + i, "</h", 3) and isdigit((unsigned char)txt[i + 3]))
          or strnieql(txt + i, "</p>", 4) or strnieql(txt + i, "</tr>", 5)
          or strnieql(txt + i, "</div>", 6) or strnieql(txt + i, "<br>", 4)) {
          new_txt[j++] = CR;
        }
      }
      else {
        // Plain '<' in ordinary text.
        new_txt[j++] = txt[i];
      }
    }
    else if(not strip and not inside_html) {
      new_txt[j++] = txt[i];
    }
    else if(strip and not quoted and (txt[i] == '>')) {
      strip = false;
    }
    else if(inside_html) {
      // The quote test must look at the current character; the previous
      // code examined txt[1], so quote tracking depended on the second
      // byte of the whole message instead of the tag contents.
      if(strip and (txt[i] == '\"')) {
        quoted = not quoted;
      }
      else if(not strip and (iscntrl((unsigned char)txt[i]) or (txt[i] == ' '))) {
        if((i > 0) && (txt[i - 1] == '='))  // compensate for quoted-printable
          new_txt[j++] = txt[i];
        else if(not last_char_was_space)
          new_txt[j++] = ' ';
        last_char_was_space = true;
      }
      else if(not strip and (txt[i] == '&')) {
        bool found = false;
        for(size_t k = 0; k < (sizeof(entities) / sizeof(entities[0])); k++) {
          long taglen = strlen(entities[k].tag);
          if(strnieql(txt + i + 1, entities[k].tag, taglen)) {
            new_txt[j++] = entities[k].replacement;
            // Skip the entity name and, when present, the closing ';'.
            i += taglen + ((txt[i + taglen + 1] == ';') ? 1 : 0);
            found = true;
            break;
          }
        }
        if(not found) {
          new_txt[j++] = txt[i];
        }
        last_char_was_space = false;
      }
      else if(not strip) {
        new_txt[j++] = txt[i];
        last_char_was_space = false;
      }
    }
  }

  new_txt[j] = NUL;

  // i == j means nothing was removed, so the caller's buffer is left
  // untouched.  The 16 bytes of slack beyond the terminator are kept
  // from the original code; NOTE(review): the reason is not visible in
  // this file - presumably callers append to the text, confirm before
  // changing.
  if(i != j) {
    txt = (char *)throw_realloc(txt, j + 17);
    memcpy(txt, new_txt, j + 1);
  }

  throw_free(new_txt);
}
// ------------------------------------------------------------------