HTML Parser

For parsing text mixed with HTML, I am using Nokogiri. Below is some sample code; note that the samples themselves rely on the HTML::Tokenizer and HTML::Node classes.
  
  
 # Sample markup. It has to contain the <div> whose name and attributes are
 # shown in the results below; with tag-free text, tags would stay empty and
 # tags.first would be nil.
 my_html = '<div class="content doublespace" id="main-content">this is some html</div>'
 tokens = HTML::Tokenizer.new(my_html)

 # Keep only the nodes that report tag? and are not closing tags.
 tags = []
 while (token = tokens.next)
   node = HTML::Node.parse(nil, 0, 0, token, false)
   tags << node if node.tag? && node.closing != :close
 end

 tags.first.name # => "div"
 tags.first.attributes # => {"class"=>"content doublespace", "id"=>"main-content"}

  
The code above is taken from Ruby on Rails Gotchas. Below is the code I wrote, together with an explanation of how it works. First, I defined a method named html_parse that takes parameters. Then I initialized all the variables I need. I declared them as local variables at the top of the method so that I can access them throughout the whole method.
  
  
  # Working state for the method.
  # json_content starts empty here and is reassigned when the result is built.
  json_content = {}
  # One hash per user/model reference found in the markup.
  collect_models = []
  # Message text accumulated so far.
  msg = ""
  # user_id taken from the most recent tag that carried one, until a text node consumes it.
  user_id = ""
  # Parsed nodes, in the order the tokenizer produced them.
  tags = []        
  
Next, pass the HTML string to the tokenizer, parse each token into a node, and append each node to the tags array.
  
  
  # Hand str_html to the tokenizer.
  tokens = HTML::Tokenizer.new(str_html)

  # tokens.next is assigned and tested in one step: the loop ends as soon as
  # it returns nil/false. Every token is turned into a node and kept.
  while token = tokens.next
  node = HTML::Node.parse(nil, 0, 0, token, false)
  tags << node
  end
  
Then, loop over the tags array.
  
  
 # Visit every index of tags. For an empty array the range is 0..-1, which
 # iterates zero times. The ":" lines stand for the elided loop body.
 (0..(tags.size - 1).to_i).each do |i|
      :
      :
      :
      :
      :
  end
  
Inside the loop, a couple of conditions determine what each array element contains: whether it is text (HTML::Text) or a tag (HTML::Tag).
  
  
 # Text node: its content is always appended to the message. When a user_id is
 # pending from an earlier tag, the text is also recorded as that user's entry
 # and the pending id is cleared.
 if tags[i].class == HTML::Text
    if not user_id.blank?
    msg += "#{tags[i].content}"
    collect_models << {name: "{fa-child #ffffff}#{tags[i].content}", id: user_id,  type: 'User', value: tags[i].content}
    user_id = ""
    else
     msg += tags[i].content
    end
  end

  # Tag node: only its attributes matter; nothing happens when attributes is nil.
  if tags[i].class == HTML::Tag
   if not tags[i].attributes.nil?
   # "user_id" attribute: remember the id for the next text node and append the user marker.
   if tags[i].attributes["user_id"].nil? == false
    user_id = tags[i].attributes["user_id"]
    msg += "{fa-child #ffffff}"
   end

   # "model_id" attribute: load the record, append its icon marker to the
   # message and record the model. icon and model_type_name come from
   # dependent_model through try.
   if tags[i].attributes["model_id"].nil? == false
    model_id = tags[i].attributes["model_id"]
    @model = Model.find(model_id)
    model_name = @model.try(:name)
    icon = @model.dependent_model.try(:icon)
    model_type_name = @model.dependent_model.try(:name)
    msg += "{#{icon} #fff} "
    collect_models << {name: "{#{icon} #000} #{model_name}", id: model_id,  type: model_type_name, value: model_name}
   end
   end
  end
  
Finally, the method returns the whole result as a hash containing the message and the collected models, which can then be sent as the JSON response.
  
  
 # Bundle the accumulated message and the collected references into one hash.
 json_content = {message: msg, models: collect_models}
  
Here is the full code.
  
  
  # Turns an HTML fragment into a message string plus a list of the users and
  # models referenced by its tags.
  #
  # Text nodes are appended to the message verbatim. A tag with a "user_id"
  # attribute appends the "{fa-child #ffffff}" marker and causes the next text
  # node to be recorded as that user's entry. A tag with a "model_id"
  # attribute is looked up with Model.find; an icon marker is appended to the
  # message and the model is recorded.
  #
  # str_html - String of HTML to scan.
  # app_url  - Unused; kept so existing callers keep working.
  #
  # Returns a Hash with :message (String) and :models (Array of Hashes with
  # the keys :name, :id, :type and :value).
  # Any error raised by Model.find propagates to the caller.
  def self.html_parse(str_html, app_url)
    collect_models = []
    msg = ""
    user_id = ""
    nodes = []

    tokens = HTML::Tokenizer.new(str_html)
    while (token = tokens.next)
      nodes << HTML::Node.parse(nil, 0, 0, token, false)
    end

    nodes.each do |node|
      # Exact class checks on purpose: instances of subclasses are ignored.
      if node.instance_of?(HTML::Text)
        text = node.content
        msg += text
        next if user_id.blank?

        # The first text after a user tag is recorded as that user's entry,
        # then the pending id is cleared so later text is plain message text.
        collect_models << {name: "{fa-child #ffffff}#{text}", id: user_id, type: 'User', value: text}
        user_id = ""
      elsif node.instance_of?(HTML::Tag)
        attrs = node.attributes
        next if attrs.nil?

        unless attrs["user_id"].nil?
          user_id = attrs["user_id"]
          msg += "{fa-child #ffffff}"
        end

        model_id = attrs["model_id"]
        next if model_id.nil?

        # A local variable, not @model: an instance variable assigned in a
        # class method lives on the class itself and outlasts the call.
        model = Model.find(model_id)
        model_name = model.try(:name)
        dependent = model.dependent_model
        icon = dependent.try(:icon)
        model_type_name = dependent.try(:name)

        msg += "{#{icon} #fff} "
        collect_models << {name: "{#{icon} #000} #{model_name}", id: model_id, type: model_type_name, value: model_name}
      end
    end

    {message: msg, models: collect_models}
  end
  

No comments:

Post a Comment