You might find it useful to extract only the text from a document. This is helpful
if you're passing the text to another API service. All the text in a document is
contained across its tabs in
text runs of
paragraph elements.
Extracting all the text in a document involves traversing the tab tree
hierarchy and calling getter methods on Tab and DocumentTab. See
Work with Tabs for more information on the tabs feature.
Text can appear in three types of structural elements within a document tab:
- Paragraph
- Table of Contents
- Tables
A table can be nested inside another table. Therefore, to extract all the text in a document, you must visit each nested structural element.
For a full description of the document body, see the Document Structure guide.
The following Google Docs API sample uses recursion to visit each structural element in all tabs of a document and prints the text.
Java
// Copyright 2019 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. import com.google.api.client.auth.oauth2.Credential; import com.google.api.client.extensions.java6.auth.oauth2.AuthorizationCodeInstalledApp; import com.google.api.client.extensions.jetty.auth.oauth2.LocalServerReceiver; import com.google.api.client.googleapis.auth.oauth2.GoogleAuthorizationCodeFlow; import com.google.api.client.googleapis.auth.oauth2.GoogleClientSecrets; import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport; import com.google.api.client.http.javanet.NetHttpTransport; import com.google.api.client.json.JsonFactory; import com.google.api.client.json.jackson2.JacksonFactory; import com.google.api.client.util.store.FileDataStoreFactory; import com.google.api.services.docs.v1.Docs; import com.google.api.services.docs.v1.DocsScopes; import com.google.api.services.docs.v1.model.Document; import com.google.api.services.docs.v1.model.DocumentTab; import com.google.api.services.docs.v1.model.ParagraphElement; import com.google.api.services.docs.v1.model.StructuralElement; import com.google.api.services.docs.v1.model.Tab; import com.google.api.services.docs.v1.model.TableCell; import com.google.api.services.docs.v1.model.TableRow; import com.google.api.services.docs.v1.model.TextRun; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.security.GeneralSecurityException; import java.util.ArrayList; import 
java.util.Collections; import java.util.List; public class ExtractText { private static final String APPLICATION_NAME = "Google Docs API Extract Guide"; private static final JsonFactory JSON_FACTORY = JacksonFactory.getDefaultInstance(); private static final String TOKENS_DIRECTORY_PATH = "tokens"; private static final String DOCUMENT_ID = "YOUR_DOCUMENT_ID"; /** * Global instance of the scopes required by this quickstart. If modifying these scopes, delete * your previously saved tokens/ folder. */ private static final List<String> SCOPES = Collections.singletonList(DocsScopes.DOCUMENTS_READONLY); private static final String CREDENTIALS_FILE_PATH = "/credentials.json"; /** * Creates an authorized Credential object. * * @param HTTP_TRANSPORT The network HTTP Transport. * @return An authorized Credential object. * @throws IOException If the credentials.json file cannot be found. */ private static Credential getCredentials(final NetHttpTransport HTTP_TRANSPORT) throws IOException { // Load client secrets. InputStream in = ExtractText.class.getResourceAsStream(CREDENTIALS_FILE_PATH); GoogleClientSecrets clientSecrets = GoogleClientSecrets.load(JSON_FACTORY, new InputStreamReader(in)); // Build flow and trigger user authorization request. GoogleAuthorizationCodeFlow flow = new GoogleAuthorizationCodeFlow.Builder(HTTP_TRANSPORT, JSON_FACTORY, clientSecrets, SCOPES) .setDataStoreFactory(new FileDataStoreFactory(new java.io.File(TOKENS_DIRECTORY_PATH))) .setAccessType("offline") .build(); LocalServerReceiver receiver = new LocalServerReceiver.Builder().setPort(8888).build(); return new AuthorizationCodeInstalledApp(flow, receiver).authorize("user"); } /** * Adds the provided tab to the list of all tabs, and recurses through and * adds all child tabs. 
*/ private void addCurrentAndChildTabs(Tab tab, List<Tab> allTabs) { allTabs.add(tab); for (Tab tab: tab.getChildTabs()) { addCurrentAndChildTabs(tab, allTabs); } } /** * Returns a flat list of all tabs in the document in the order they would * appear in the UI (top-down ordering). Includes all child tabs. */ private List<Tab> getAllTabs(Document doc) { List<Tab> allTabs = new ArrayList<>(); // Iterate over all tabs and recursively add any child tabs to generate a // flat list of Tabs. for (Tab tab: doc.getTabs()) { addCurrentAndChildTabs(tab, allTabs); } return allTabs; } /** * Returns the text in the given ParagraphElement. * * @param element a ParagraphElement from a Google Doc */ private static String readParagraphElement(ParagraphElement element) { TextRun run = element.getTextRun(); if (run == null || run.getContent() == null) { // The TextRun can be null if there is an inline object. return ""; } return run.getContent(); } /** * Recurses through a list of Structural Elements to read a document's text where text may be in * nested elements. * * @param elements a list of Structural Elements */ private static String readStructuralElements(List<StructuralElement> elements) { StringBuilder sb = new StringBuilder(); for (StructuralElement element : elements) { if (element.getParagraph() != null) { for (ParagraphElement paragraphElement : element.getParagraph().getElements()) { sb.append(readParagraphElement(paragraphElement)); } } else if (element.getTable() != null) { // The text in table cells are in nested Structural Elements and tables may be // nested. for (TableRow row : element.getTable().getTableRows()) { for (TableCell cell : row.getTableCells()) { sb.append(readStructuralElements(cell.getContent())); } } } else if (element.getTableOfContents() != null) { // The text in the TOC is also in a Structural Element. sb.append(readStructuralElements(element.getTableOfContents().getContent())); } } return sb.toString(); } public static void main(String... 
args) throws IOException, GeneralSecurityException { // Build a new authorized API client service. final NetHttpTransport HTTP_TRANSPORT = GoogleNetHttpTransport.newTrustedTransport(); Docs service = new Docs.Builder(HTTP_TRANSPORT, JSON_FACTORY, getCredentials(HTTP_TRANSPORT)) .setApplicationName(APPLICATION_NAME) .build(); // Fetch the document with all of the tabs populated, including any nested // child tabs. Document doc = service.documents().get(DOCUMENT_ID).setIncludeTabsContent(true).execute(); List<Tab> allTabs = getAllTabs(doc); // Print the text from each tab in the document. for (Tab tab: allTabs) { // Get the DocumentTab from the generic Tab. DocumentTab documentTab = tab.getDocumentTab(); System.out.println( readStructuralElements(documentTab.getBody().getContent())); } } }
Python
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Recursively extracts the text from a Google Doc.
"""
import googleapiclient.discovery as discovery
from httplib2 import Http
from oauth2client import client
from oauth2client import file
from oauth2client import tools

SCOPES = 'https://www.googleapis.com/auth/documents.readonly'
DISCOVERY_DOC = 'https://docs.googleapis.com/$discovery/rest?version=v1'
DOCUMENT_ID = 'YOUR_DOCUMENT_ID'


def get_credentials():
  """Gets valid user credentials from storage.

  If nothing has been stored, or if the stored credentials are invalid,
  the OAuth 2.0 flow is completed to obtain the new credentials.

  Returns:
      Credentials, the obtained credential.
  """
  store = file.Storage('token.json')
  credentials = store.get()

  if not credentials or credentials.invalid:
    flow = client.flow_from_clientsecrets('credentials.json', SCOPES)
    credentials = tools.run_flow(flow, store)
  return credentials


def add_current_and_child_tabs(tab, all_tabs):
  """Adds the provided tab to the list of all tabs, and recurses through and
  adds all child tabs.

  Args:
      tab: a Tab from a Google Doc.
      all_tabs: a list of all tabs in the document.
  """
  all_tabs.append(tab)
  # Treat a missing (or null) 'childTabs' entry as "no children" instead of
  # failing while iterating over None.
  for child_tab in tab.get('childTabs') or []:
    add_current_and_child_tabs(child_tab, all_tabs)


def get_all_tabs(doc):
  """Returns a flat list of all tabs in the document in the order they would
  appear in the UI (top-down ordering). Includes all child tabs.

  Args:
      doc: a document.

  Returns:
      A list of tabs, each parent followed by its descendants.
  """
  all_tabs = []
  # Iterate over all tabs and recursively add any child tabs to generate a
  # flat list of Tabs.
  for tab in doc.get('tabs') or []:
    add_current_and_child_tabs(tab, all_tabs)
  return all_tabs


def read_paragraph_element(element):
  """Returns the text in the given ParagraphElement.

  Args:
      element: a ParagraphElement from a Google Doc.

  Returns:
      The content of the element's text run, or '' when the element has no
      text run or the text run has no content.
  """
  text_run = element.get('textRun')
  if not text_run:
    return ''
  # Always return a string so callers can concatenate the result safely.
  return text_run.get('content') or ''


def read_structural_elements(elements):
  """Recurses through a list of Structural Elements to read a document's text
  where text may be in nested elements.

  Args:
      elements: a list of Structural Elements. None is treated as empty.

  Returns:
      The concatenated text of all paragraphs, table cells and tables of
      contents found in the elements.
  """
  parts = []
  for value in elements or []:
    if 'paragraph' in value:
      paragraph_elements = value.get('paragraph').get('elements') or []
      for elem in paragraph_elements:
        parts.append(read_paragraph_element(elem))
    elif 'table' in value:
      # The text in table cells are in nested Structural Elements and tables may
      # be nested.
      table = value.get('table')
      for row in table.get('tableRows') or []:
        for cell in row.get('tableCells') or []:
          parts.append(read_structural_elements(cell.get('content')))
    elif 'tableOfContents' in value:
      # The text in the TOC is also in a Structural Element.
      toc = value.get('tableOfContents')
      parts.append(read_structural_elements(toc.get('content')))
  return ''.join(parts)


def main():
  """Uses the Docs API to print out the text of a document."""
  credentials = get_credentials()
  http = credentials.authorize(Http())
  docs_service = discovery.build(
      'docs', 'v1', http=http, discoveryServiceUrl=DISCOVERY_DOC
  )

  # Fetch the document with all of the tabs populated, including any nested
  # child tabs. The discovery-built client takes the API's own parameter
  # name, which is camelCase (includeTabsContent).
  doc = (
      docs_service.documents()
      .get(documentId=DOCUMENT_ID, includeTabsContent=True)
      .execute()
  )
  all_tabs = get_all_tabs(doc)

  # Print the text from each tab in the document.
  for tab in all_tabs:
    # Get the DocumentTab from the generic Tab.
    document_tab = tab.get('documentTab') or {}
    body = document_tab.get('body') or {}
    print(read_structural_elements(body.get('content')))


if __name__ == '__main__':
  main()