// ------------------------------------------------------------------ // GoldED+ // Copyright (C) 2003 Alexander S. Aganichev // ------------------------------------------------------------------ // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License as // published by the Free Software Foundation; either version 2 of the // License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, // MA 02111-1307 USA // ------------------------------------------------------------------ // $Id$ // ------------------------------------------------------------------ // HTML tag remover. // ------------------------------------------------------------------ #include // ------------------------------------------------------------------ const static struct html_entities { const char *tag; char replacement; } entities[] = { {"nbsp", ' '}, {"brvbar", '|'}, {"laquo", '<'}, {"shy", '-'}, {"raquo", '>'}, {"divide", '/'}, {"quot", '\"'}, {"amp", '&'}, {"lt", '<'}, {"gt", '>'} }; // ------------------------------------------------------------------ void RemoveHTML (char *&txt) { long i, j, len = strlen(txt) + 1; char *new_txt = (char *)throw_malloc(len); bool strip = false; bool quoted = false; bool inside_html = false; bool last_char_was_space = true; for(i = j = 0; txt[i] != NUL; i++) { if(not quoted and not strip and (txt[i] == '<')) { if(strnieql(txt + i, "", 7)) { inside_html = false; strip = true; } else if(not inside_html and (txt[i + 1] == '/')) { inside_html = true; // closing html tag, force html mode strip = true; } else if(inside_html) { strip = true; if(strnieql(txt + i, "", 3) or strnieql(txt + i, "", 4)) new_txt[j++] = '*'; if(strnieql(txt + i, "", 3) or strnieql(txt + i, "", 4)) new_txt[j++] = '/'; if(strnieql(txt + i, "", 3) or strnieql(txt + i, "", 4)) new_txt[j++] = '_'; if((strnieql(txt + i, "", 4) or strnieql(txt + i, "", 5) or strnieql(txt + i, "", 6) or strnieql(txt + i, "
", 4)) { new_txt[j++] = CR; } } else { new_txt[j++] = txt[i]; } } else if(not strip and not inside_html) { new_txt[j++] = txt[i]; } else if(strip and not quoted and (txt[i] == '>')) { strip = false; } else if(inside_html) { if(strip and (txt[1] == '\"')) { quoted = not quoted; } else if(not strip and (iscntrl(txt[i]) or (txt[i] == ' '))) { if((i > 0) && (txt[i - 1] == '=')) // compensate for quoted-printable new_txt[j++] = txt[i]; else if(not last_char_was_space) new_txt[j++] = ' '; last_char_was_space = true; } else if(not strip and (txt[i] == '&')) { bool found = false; for (int k = 0; k < (sizeof(entities) / sizeof(html_entities)); k++) { long taglen = strlen (entities[k].tag); if(strnieql (txt + i + 1, entities[k].tag, taglen)) { new_txt[j++] = entities[k].replacement; i += taglen + ((txt[i + taglen + 1] == ';') ? 1 : 0); found = true; break; } } if(not found) { new_txt[j++] = txt[i]; } last_char_was_space = false; } else if(not strip) { new_txt[j++] = txt[i]; last_char_was_space = false; } } } new_txt[j] = NUL; if (i != j) { txt = (char *)throw_realloc(txt, j + 17); memcpy(txt, new_txt, j + 1); } throw_free(new_txt); } // ------------------------------------------------------------------