Have you tried using this parser?
Include the rtf.php file somewhere in your project. Then do the following:
// Create the parser, load the RTF source and parse it into an element tree.
$reader = new RtfReader();
$rtf = file_get_contents("test.rtf"); // or use a string
$reader->Parse($rtf);
If you want to see what was parsed, dump the element tree:
// Echo the parsed element tree (as HTML <div> lines) for inspection.
$reader->root->dump();
To convert to HTML:
// Render the parsed tree to an HTML string and send it to the output.
$formatter = new RtfHtml();
echo $formatter->Format($reader->root);
To save the result, write the generated HTML to a file with a function such as file_put_contents.
Here's an example:
// Keep the generated HTML in a variable, then write it to test.html.
$html = $formatter->Format($reader->root);
file_put_contents('test.html', $html);
The rtf.php file:
/**
* RTF parser/formatter
*
* This code reads RTF files and formats the RTF data to HTML.
*
* PHP version 5
*
* @author Alexander van Oostenrijk
* @copyright 2014 Alexander van Oostenrijk
* @license GNU
* @version 1
* @link http://www.websofia.com
*
* Sample of use:
*
* $reader = new RtfReader();
* $rtf = file_get_contents("itc.rtf"); // or use a string
* $reader->Parse($rtf);
* //$reader->root->dump(); // to see what the reader read
* $formatter = new RtfHtml();
* echo $formatter->Format($reader->root);
*/
/**
 * Base class of every node in the parsed RTF tree.
 */
class RtfElement
{
    /**
     * Echo the indentation that precedes one line of dump() output.
     *
     * The dump() methods write HTML, where runs of ordinary spaces collapse
     * when rendered, so non-breaking space entities are emitted to keep the
     * nesting visible in a browser.
     *
     * @param int $level Nesting depth; two entities are written per level.
     *                   Zero or negative levels write nothing.
     */
    protected function Indent($level)
    {
        $count = (int) $level * 2;
        if ($count > 0) {
            echo str_repeat("&nbsp;", $count);
        }
    }
}
/**
 * A brace-delimited RTF group: an ordered list of child elements plus a link
 * back to the enclosing group.
 */
class RtfGroup extends RtfElement
{
    /** @var RtfGroup|null Enclosing group, or null when there is none. */
    public $parent;

    /** @var array Child elements in document order. */
    public $children;

    public function __construct()
    {
        $this->parent = null;
        $this->children = array();
    }

    /**
     * Name of the control word that opens this group.
     *
     * @return string|null The first child's word, or null when the group is
     *                     empty or does not start with a control word.
     */
    public function GetType()
    {
        if (count($this->children) == 0) {
            return null;
        }

        $first = $this->children[0];
        if (!($first instanceof RtfControlWord)) {
            return null;
        }

        return $first->word;
    }

    /**
     * Whether this group starts with the "\*" control symbol.
     *
     * @return bool|null True/false when the first child is a control symbol,
     *                   null when the group is empty or starts with anything else.
     */
    public function IsDestination()
    {
        if (count($this->children) == 0) {
            return null;
        }

        $first = $this->children[0];
        if (!($first instanceof RtfControlSymbol)) {
            return null;
        }

        return $first->symbol == '*';
    }

    /**
     * Echo this group and its children as HTML <div> lines.
     *
     * Child groups of type fonttbl, colortbl, stylesheet or info, groups whose
     * type starts with "pict", and "\*" destination groups are left out.
     *
     * @param int $level Indentation level of the opening and closing brace.
     */
    public function dump($level = 0)
    {
        echo "<div>";
        $this->Indent($level);
        echo "{";
        echo "</div>";

        foreach ($this->children as $child) {
            if ($child instanceof RtfGroup) {
                // Resolve the type once; the cast keeps null away from substr().
                $type = (string) $child->GetType();

                if (in_array($type, array("fonttbl", "colortbl", "stylesheet", "info"), true)) {
                    continue;
                }
                // Skip any pictures:
                if (substr($type, 0, 4) == "pict") {
                    continue;
                }
                if ($child->IsDestination()) {
                    continue;
                }
            }

            $child->dump($level + 2);
        }

        echo "<div>";
        $this->Indent($level);
        echo "}";
        echo "</div>";
    }
}
/**
 * A control word such as "\b" or "\fs24": a name plus a numeric parameter.
 */
class RtfControlWord extends RtfElement
{
    /** @var string Name of the control word, without the backslash. */
    public $word;

    /** @var int Parameter stored alongside the word. */
    public $parameter;

    /**
     * Echo this control word as one green HTML <div> line.
     *
     * @param int $level Indentation level passed to Indent().
     */
    public function dump($level)
    {
        $label = "WORD " . $this->word . " (" . $this->parameter . ")";

        echo "<div style='color:green'>";
        $this->Indent($level);
        echo $label . "</div>";
    }
}
/**
 * A control symbol: a backslash followed by one non-letter character,
 * with an optional numeric parameter.
 */
class RtfControlSymbol extends RtfElement
{
    /** @var string The single character following the backslash. */
    public $symbol;

    /** @var int Parameter of the symbol; 0 when it has none. */
    public $parameter = 0;

    /**
     * Echo this control symbol as one blue HTML <div> line.
     *
     * The symbol can be any non-letter character, including "<" or "&", so it
     * is HTML-escaped. ISO-8859-1 is used because it accepts every byte value.
     *
     * @param int $level Indentation level passed to Indent().
     */
    public function dump($level)
    {
        $symbol = htmlspecialchars((string) $this->symbol, ENT_NOQUOTES, 'ISO-8859-1');

        echo "<div style='color:blue'>";
        $this->Indent($level);
        echo "SYMBOL {$symbol} ({$this->parameter})";
        echo "</div>";
    }
}
/**
 * A run of plain document text.
 */
class RtfText extends RtfElement
{
    /** @var string The text exactly as read from the document. */
    public $text;

    /**
     * Echo this text run as one red HTML <div> line.
     *
     * Document text may contain "<", ">" or "&", which would otherwise be
     * interpreted as markup, so it is HTML-escaped. ISO-8859-1 is used because
     * it accepts every byte value and therefore never discards input.
     *
     * @param int $level Indentation level passed to Indent().
     */
    public function dump($level)
    {
        $text = htmlspecialchars((string) $this->text, ENT_NOQUOTES, 'ISO-8859-1');

        echo "<div style='color:red'>";
        $this->Indent($level);
        echo "TEXT {$text}";
        echo "</div>";
    }
}
/**
 * Reads an RTF document into a tree of RtfGroup, RtfControlWord,
 * RtfControlSymbol and RtfText elements. After Parse() the tree is
 * available through $root.
 */
class RtfReader
{
    /** @var RtfGroup|null Outermost group of the last parsed document. */
    public $root = null;

    // Parser state. Declared explicitly instead of being created on first
    // assignment (dynamic properties are deprecated as of PHP 8.2).

    /** @var string Document being parsed. */
    protected $rtf = "";

    /** @var int Offset of the next character to read. */
    protected $pos = 0;

    /** @var int Length of $rtf in bytes. */
    protected $len = 0;

    /** @var string Character read last; "" once the end has been passed. */
    protected $char = "";

    /** @var RtfGroup|null Group currently receiving children. */
    protected $group = null;

    /**
     * Read the next character into $char and advance $pos.
     *
     * Past the end of the input $char becomes "" but $pos still advances, so
     * the "put the character back" rewinds used by the parse methods stay
     * balanced at end of input.
     */
    protected function GetChar()
    {
        $this->char = ($this->pos < $this->len) ? $this->rtf[$this->pos] : "";
        $this->pos++;
    }

    /**
     * Append an element to the current group.
     *
     * Content that appears while no group is open (for example trailing bytes
     * after the root group has been closed) is discarded.
     *
     * @param RtfElement $element Element to append.
     */
    protected function AddChild($element)
    {
        if ($this->group !== null) {
            $this->group->children[] = $element;
        }
    }

    /**
     * Handle "{": open a new group nested inside the current one.
     */
    protected function ParseStartGroup()
    {
        $group = new RtfGroup();
        $group->parent = $this->group;

        if ($this->root === null) {
            $this->root = $group;
        } elseif ($this->group !== null) {
            $this->group->children[] = $group;
        }
        // Otherwise the group opens after the root was closed: it is parsed so
        // that braces stay balanced, but it is not attached to the tree.

        $this->group = $group;
    }

    /**
     * Whether $char is an ASCII letter (A-Z or a-z).
     *
     * @return bool
     */
    protected function is_letter()
    {
        if ($this->char === "") return FALSE;
        $code = ord($this->char);
        if ($code >= 65 && $code <= 90) return TRUE;   // A-Z
        if ($code >= 97 && $code <= 122) return TRUE;  // a-z
        return FALSE;
    }

    /**
     * Whether $char is an ASCII digit (0-9).
     *
     * @return bool
     */
    protected function is_digit()
    {
        if ($this->char === "") return FALSE;
        $code = ord($this->char);
        return ($code >= 48 && $code <= 57) ? TRUE : FALSE;
    }

    /**
     * Handle "}": return to the enclosing group. An unmatched "}" is ignored.
     */
    protected function ParseEndGroup()
    {
        if ($this->group !== null) {
            $this->group = $this->group->parent;
        }
    }

    /**
     * Parse a control word (letters plus an optional signed decimal
     * parameter) and append it to the current group. The parameter defaults
     * to 1 when the word carries none.
     */
    protected function ParseControlWord()
    {
        $this->GetChar();
        $word = "";
        while ($this->is_letter()) {
            $word .= $this->char;
            $this->GetChar();
        }

        // Read parameter (if any) consisting of digits.
        // Parameter may be negative.
        $parameter = null;
        $negative = false;
        if ($this->char == '-') {
            $this->GetChar();
            $negative = true;
        }
        while ($this->is_digit()) {
            $parameter = (int) $parameter * 10 + (int) $this->char;
            $this->GetChar();
        }
        if ($parameter === null) $parameter = 1;
        if ($negative) $parameter = -$parameter;

        if ($word == "u") {
            // \uN is followed by a fallback for readers without Unicode
            // support. $char already holds its first character.
            if ($this->char == '\\') {
                if ($this->pos < $this->len && $this->rtf[$this->pos] == "'") {
                    // Fallback written as \'hh: skip the quote and both hex digits.
                    $this->pos = min($this->pos + 3, $this->len);
                } else {
                    // Another control follows instead of a fallback; put the
                    // backslash back so it is parsed normally.
                    $this->pos--;
                }
            } elseif ($this->char == '{' || $this->char == '}' || $this->char === "") {
                // Structural character or end of input: not a fallback.
                $this->pos--;
            }
            // Otherwise the single fallback character has been consumed.
        } else {
            // A space is the delimiter and is consumed. Anything else belongs
            // to the next item, so put the character back.
            if ($this->char != ' ') $this->pos--;
        }

        $rtfword = new RtfControlWord();
        $rtfword->word = $word;
        $rtfword->parameter = $parameter;
        $this->AddChild($rtfword);
    }

    /**
     * Parse a control symbol (one non-letter character) and append it to the
     * current group. For \'hh the parameter is the value of the two hex
     * digits; for every other symbol it is 0.
     */
    protected function ParseControlSymbol()
    {
        // Read symbol (one character only).
        $this->GetChar();
        $symbol = $this->char;

        $parameter = 0;
        if ($symbol == '\'') {
            $this->GetChar();
            $hex = $this->char;
            $this->GetChar();
            $hex .= $this->char;
            // Malformed hex digits yield 0 instead of a hexdec() deprecation notice.
            $parameter = ctype_xdigit($hex) ? hexdec($hex) : 0;
        }

        $rtfsymbol = new RtfControlSymbol();
        $rtfsymbol->symbol = $symbol;
        $rtfsymbol->parameter = $parameter;
        $this->AddChild($rtfsymbol);
    }

    /**
     * Handle "\": look one character ahead to decide between a control word
     * (starts with a letter) and a control symbol (anything else).
     */
    protected function ParseControl()
    {
        $this->GetChar();
        $this->pos--;
        if ($this->is_letter())
            $this->ParseControlWord();
        else
            $this->ParseControlSymbol();
    }

    /**
     * Parse plain text up to the next unescaped backslash or brace and append
     * it to the current group. The escapes \\, \{ and \} contribute their
     * literal character once.
     */
    protected function ParseText()
    {
        $text = "";
        do {
            $terminate = false;

            if ($this->char == '\\') {
                // Look ahead to see whether this is an escaped literal.
                $this->GetChar();
                if ($this->char != '\\' && $this->char != '{' && $this->char != '}') {
                    // Not an escape: rewind to the backslash and stop.
                    $this->pos = $this->pos - 2;
                    $terminate = true;
                }
                // Otherwise $char now holds the literal, appended below.
            } else if ($this->char == '{' || $this->char == '}') {
                $this->pos--;
                $terminate = true;
            }

            if (!$terminate) {
                $text .= $this->char;
                $this->GetChar();
            }
        }
        // $pos == $len means $char holds the last real character, which still
        // has to be processed; only beyond that is the input exhausted.
        while (!$terminate && $this->pos <= $this->len);

        $rtftext = new RtfText();
        $rtftext->text = $text;
        $this->AddChild($rtftext);
    }

    /**
     * Parse an RTF document. The resulting tree is stored in $root, which
     * stays null when the input contains no group.
     *
     * @param string $rtf Raw RTF source.
     */
    public function Parse($rtf)
    {
        $this->rtf = (string) $rtf;
        $this->pos = 0;
        $this->len = strlen($this->rtf);
        $this->group = null;
        $this->root = null;

        while ($this->pos < $this->len) {
            // Read next character:
            $this->GetChar();

            // Ignore \r and \n
            if ($this->char == "\n" || $this->char == "\r") continue;

            // What type of character is this?
            switch ($this->char) {
                case '{':
                    $this->ParseStartGroup();
                    break;
                case '}':
                    $this->ParseEndGroup();
                    break;
                case '\\':
                    $this->ParseControl();
                    break;
                default:
                    $this->ParseText();
                    break;
            }
        }
    }
}
/**
 * Character formatting in effect at one point of the document.
 */
class RtfState
{
    // Declared explicitly instead of being created on first assignment in
    // Reset() (dynamic properties are deprecated as of PHP 8.2). They stay
    // public because the formatter reads and writes them directly.
    public $bold = false;
    public $italic = false;
    public $underline = false;
    public $end_underline = false;
    public $strike = false;
    public $hidden = false;

    /** @var int|float Font size; 0 means no size has been set. */
    public $fontsize = 0;

    public function __construct()
    {
        $this->Reset();
    }

    /**
     * Switch every formatting flag off and clear the font size.
     */
    public function Reset()
    {
        $this->bold = false;
        $this->italic = false;
        $this->underline = false;
        $this->end_underline = false;
        $this->strike = false;
        $this->hidden = false;
        $this->fontsize = 0;
    }
}
/**
 * Renders a parsed RTF tree as HTML. Every text run is wrapped in a <span>
 * whose inline style reflects the formatting state in effect.
 */
class RtfHtml
{
    // Declared explicitly instead of being created on first assignment
    // (dynamic properties are deprecated as of PHP 8.2).

    /** @var string HTML produced so far. */
    protected $output = "";

    /** @var array Stack of RtfState objects, one per open group. */
    protected $states = array();

    /** @var RtfState|null State on top of the stack. */
    protected $state = null;

    /**
     * Convert a parsed document to HTML.
     *
     * @param RtfGroup|null $root Root group produced by RtfReader::Parse().
     * @return string The generated HTML; "" when $root is null.
     */
    public function Format($root)
    {
        $this->output = "";
        // Create a stack of states:
        $this->states = array();
        // Put an initial standard state onto the stack:
        $this->state = new RtfState();
        array_push($this->states, $this->state);

        // The reader leaves root at null when the input held no group.
        if ($root !== null) {
            $this->FormatGroup($root);
        }

        return $this->output;
    }

    /**
     * Render one group and its children. Formatting changed inside the group
     * is discarded again when the group ends.
     *
     * @param RtfGroup $group Group to render.
     */
    protected function FormatGroup($group)
    {
        // Can we ignore this group? The cast keeps null away from substr().
        $type = (string) $group->GetType();
        if (in_array($type, array("fonttbl", "colortbl", "stylesheet", "info"), true)) return;
        // Skip any pictures:
        if (substr($type, 0, 4) == "pict") return;
        if ($group->IsDestination()) return;

        // Push a new state onto the stack:
        $this->state = clone $this->state;
        array_push($this->states, $this->state);

        foreach ($group->children as $child) {
            if ($child instanceof RtfGroup) {
                $this->FormatGroup($child);
            } elseif ($child instanceof RtfControlWord) {
                $this->FormatControlWord($child);
            } elseif ($child instanceof RtfControlSymbol) {
                $this->FormatControlSymbol($child);
            } elseif ($child instanceof RtfText) {
                $this->FormatText($child);
            }
        }

        // Pop state from stack.
        array_pop($this->states);
        $this->state = end($this->states);
    }

    /**
     * Apply one control word: update the formatting state or emit the
     * character it stands for. Unknown words are ignored.
     *
     * @param RtfControlWord $word Control word to apply.
     */
    protected function FormatControlWord($word)
    {
        switch ($word->word) {
            case "plain":
                $this->state->Reset();
                break;
            case "b":
                $this->state->bold = $word->parameter;
                break;
            case "i":
                $this->state->italic = $word->parameter;
                break;
            case "ul":
                $this->state->underline = $word->parameter;
                // Clear a previous \ulnone so underline can be switched back on.
                $this->state->end_underline = false;
                break;
            case "ulnone":
                $this->state->underline = false;
                $this->state->end_underline = $word->parameter;
                break;
            case "strike":
                $this->state->strike = $word->parameter;
                break;
            case "v":
                $this->state->hidden = $word->parameter;
                break;
            case "fs":
                $this->state->fontsize = ceil(($word->parameter / 24) * 16);
                break;
            case "par":
                $this->output .= "<p>";
                break;

            // Characters:
            case "lquote":
                $this->output .= "‘";
                break;
            case "rquote":
                $this->output .= "’";
                break;
            case "ldblquote":
                $this->output .= "“";
                break;
            case "rdblquote":
                $this->output .= "”";
                break;
            case "emdash":
                $this->output .= "—";
                break;
            case "endash":
                $this->output .= "–";
                break;
            case "bullet":
                $this->output .= "•";
                break;
            case "u":
                // \uN carries a signed 16-bit value: code points above 32767
                // are written as negative numbers. Surrogate halves are passed
                // through unchanged and are not combined.
                $code = (int) $word->parameter;
                if ($code < 0) $code += 65536;
                $this->BeginState();
                $this->output .= "&#{$code};";
                $this->EndState();
                break;
        }
    }

    /**
     * Open a <span> whose inline style reflects the current state.
     */
    protected function BeginState()
    {
        $span = "";
        if ($this->state->bold) $span .= "font-weight:bold;";
        if ($this->state->italic) $span .= "font-style:italic;";

        // Underline and strike-through share one CSS property, so they are
        // emitted as a single declaration; separate declarations would
        // override each other.
        $decorations = array();
        if ($this->state->underline && !$this->state->end_underline) $decorations[] = "underline";
        if ($this->state->strike) $decorations[] = "line-through";
        if (count($decorations) > 0) {
            $span .= "text-decoration:" . implode(" ", $decorations) . ";";
        } elseif ($this->state->end_underline) {
            $span .= "text-decoration:none;";
        }

        if ($this->state->hidden) $span .= "display:none;";
        if ($this->state->fontsize != 0) $span .= "font-size: {$this->state->fontsize}px;";
        $this->output .= "<span style='{$span}'>";
    }

    /**
     * Close the <span> opened by BeginState().
     */
    protected function EndState()
    {
        $this->output .= "</span>";
    }

    /**
     * Render a control symbol. Only \'hh (a character given by its byte
     * value) produces output; all other symbols are ignored.
     *
     * @param RtfControlSymbol $symbol Control symbol to render.
     */
    protected function FormatControlSymbol($symbol)
    {
        if ($symbol->symbol == '\'') {
            $this->BeginState();
            // Decode as Windows-1252: it matches ISO-8859-1 for 0xA0-0xFF and
            // also maps 0x80-0x9F (curly quotes, dashes, euro sign), which
            // ISO-8859-1 treats as control characters.
            // NOTE(review): assumes a Western code page; \ansicpg is not consulted.
            $this->output .= htmlentities(chr($symbol->parameter), ENT_QUOTES, 'cp1252');
            $this->EndState();
        }
    }

    /**
     * Render a run of document text inside a styled <span>.
     *
     * The text is HTML-escaped so that "<", ">" and "&" in the document do not
     * turn into markup. ISO-8859-1 is used because it accepts every byte value
     * and therefore never discards input.
     *
     * @param RtfText $text Text run to render.
     */
    protected function FormatText($text)
    {
        $this->BeginState();
        $this->output .= htmlspecialchars((string) $text->text, ENT_NOQUOTES, 'ISO-8859-1');
        $this->EndState();
    }
}
Update
Before parsing, check that the content to be converted really is in RTF format. A regular expression can tell whether a string shows the traces of that format:
$texto = '{\rtf1\ansi\ansicpg1252\deff0\deflang2057{\fonttbl{\f0\fnil\fcharset0 Tahoma;}}';
// An RTF document opens with the "{\rtf" header, so test for that signature.
// Looking for any brace or any run of non-whitespace characters would
// classify almost every non-empty string as RTF.
if (preg_match('/^\s*\{\\\\rtf/', $texto) === 1) {
    // $texto looks like RTF
} else {
    // $texto looks like plain text
}
If the text is in RTF format, parse it; otherwise return it as plain text.